64 changed files with 56710 additions and 2156 deletions
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@ -1,8 +0,0 @@
 # Default ignored files
 /shelf/
 /workspace.xml
 # Datasource local storage ignored files
 /../../../../../:\hkn\project_folder\Gencoding3\.idea/dataSources/
 /dataSources.local.xml
 # Editor-based HTTP Client requests
 /httpRequests/
--- a/.idea/Gencoding3.iml
+++ b/.idea/Gencoding3.iml
@ -1,17 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$/Genius3/python" isTestSource="false" />
    </content>
    <orderEntry type="jdk" jdkName="Python 2.7" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyDocumentationSettings">
    <option name="format" value="PLAIN" />
    <option name="myDocStringFormat" value="Plain" />
  </component>
  <component name="TestRunnerService">
    <option name="PROJECT_TEST_RUNNER" value="pytest" />
  </component>
 </module>
--- a/.idea/deployment.xml
+++ b/.idea/deployment.xml
@ -1,21 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false">
    <serverData>
      <paths name="304">
        <serverdata>
          <mappings>
            <mapping local="$PROJECT_DIR$" web="/" />
          </mappings>
        </serverdata>
      </paths>
      <paths name="root@region-42.seetacloud.com:58034 password">
        <serverdata>
          <mappings>
            <mapping local="$PROJECT_DIR$" web="/" />
          </mappings>
        </serverdata>
      </paths>
    </serverData>
  </component>
 </project>
--- a/.idea/inspectionProfiles/Project_Default.xml
+++ b/.idea/inspectionProfiles/Project_Default.xml
@ -1,24 +0,0 @@
 <component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="PyChainedComparisonsInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoreConstantInTheMiddle" value="true" />
    </inspection_tool>
    <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="E501" />
        </list>
      </option>
    </inspection_tool>
    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="N806" />
          <option value="N802" />
          <option value="N803" />
        </list>
      </option>
    </inspection_tool>
  </profile>
 </component>
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@ -1,6 +0,0 @@
 <component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
 </component>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -1,4 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7" project-jdk-type="Python SDK" />
 </project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@ -1,8 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/Gencoding3.iml" filepath="$PROJECT_DIR$/.idea/Gencoding3.iml" />
    </modules>
  </component>
 </project>
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@ -1,6 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
 </project>
--- a/Genius3/beautified_sample.json
+++ b/Genius3/beautified_sample.json
@ -1,623 +0,0 @@
 {
    "function_edges": [
        [
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1
        ],
        [
            0,
            2,
            3,
            4,
            5,
            6,
            7,
            8,
            9,
            10,
            11,
            12,
            13,
            14,
            15,
            16,
            17,
            18,
            19,
            20,
            21,
            22,
            23,
            24,
            25,
            26
        ]
    ], // See the FCG: source-function index and destination-function index of every edge connecting functions
    "acfg_list": [ // corresponds to data.raw_graph_list
        { // one CFG, corresponds to data.raw_graph_list[a]
            "block_number": 3, // number of basic blocks in the CFG √ data.raw_graph_list[a].g.__len__()
            "block_edges": [
                [
                    0,
                    0,
                    1,
                    1
                ],
                [
                    0,
                    2,
                    0,
                    2
                ]
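            ], // the middle block is actually block 0, for reasons unknown; the first array holds the source block index of every edge, the second array holds the destination block index of every edge √ data.raw_graph_list[a].g.edges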
            "block_features": [ // attributes of each basic block
                [
                    0,
                    2,
                    1,
                    0,
                    7,
                    0,
                    1,
                    1,
                    4,
                    0,
                    0
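                ], // attribute features of each block: an 11-dimensional vector, namely call / transfer / arithmetic / logic / compare / move / termination / data declaration / total instruction count / string or integer constants / number of offspring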
                [
                    0,
                    2,
                    0,
                    0,
                    3,
                    1,
                    0,
                    1,
                    0,
                    0,
                    0
                ],
                [
                    1,
                    0,
                    0,
                    0,
                    1,
                    0,
                    0,
                    0,
                    0,
                    1,
                    0
                ]
            ]
        },
        {
            "block_number": 29, // number of basic blocks in the CFG
            "block_edges": [
                [
                    0,
                    1,
                    1,
                    2,
                    2,
                    3,
                    3,
                    4,
                    5,
                    6,
                    6,
                    7,
                    7,
                    8,
                    8,
                    9,
                    9,
                    10,
                    10,
                    11,
                    12,
                    12,
                    13,
                    14,
                    14,
                    15,
                    16,
                    17,
                    18,
                    19,
                    19,
                    20,
                    20,
                    21,
                    21,
                    23,
                    24,
                    24,
                    26,
                    26,
                    27,
                    28
                ],
                [
                    16,
                    0,
                    2,
                    0,
                    4,
                    1,
                    3,
                    3,
                    3,
                    25,
                    15,
                    8,
                    6,
                    6,
                    7,
                    28,
                    12,
                    9,
                    23,
                    16,
                    25,
                    11,
                    21,
                    17,
                    13,
                    19,
                    22,
                    14,
                    19,
                    18,
                    27,
                    24,
                    23,
                    26,
                    21,
                    22,
                    25,
                    10,
                    25,
                    5,
                    14,
                    8
                ]
            ],
            "block_features": [
                [
                    8,
                    2,
                    1,
                    5,
                    36,
                    0,
                    6,
                    0,
                    2,
                    0,
                    0
                ],
                [
                    0,
                    7,
                    0,
                    0,
                    3,
                    0,
                    1,
                    1,
                    1,
                    0,
                    0
                ],
                [
                    0,
                    7,
                    0,
                    0,
                    2,
                    0,
                    1,
                    1,
                    0,
                    0,
                    0
                ],
                [
                    0,
                    7,
                    0,
                    1,
                    8,
                    1,
                    2,
                    0,
                    0,
                    0,
                    0
                ],
                [
                    0,
                    7,
                    1,
                    0,
                    2,
                    0,
                    1,
                    0,
                    0,
                    0,
                    0
                ],
                [
                    0,
                    7,
                    0,
                    0,
                    1,
                    0,
                    0,
                    0,
                    1,
                    0,
                    0
                ],
                [
                    1,
                    18,
                    0,
                    1,
                    9,
                    0,
                    2,
                    1,
                    1,
                    0,
                    0
                ],
                [
                    1,
                    21,
                    1,
                    0,
                    3,
                    0,
                    1,
                    1,
                    0,
                    0,
                    0
                ],
                [
                    0,
                    21,
                    0,
                    1,
                    4,
                    1,
                    2,
                    0,
                    0,
                    0,
                    0
                ],
                [
                    0,
                    24,
                    0,
                    2,
                    12,
                    1,
                    3,
                    0,
                    0,
                    0,
                    0
                ],
                [
                    1,
                    26,
                    0,
                    3,
                    16,
                    0,
                    4,
                    1,
                    4,
                    0,
                    0
                ],
                [
                    1,
                    2,
                    0,
                    5,
                    22,
                    0,
                    5,
                    0,
                    1,
                    0,
                    0
                ],
                [
                    5,
                    4,
                    1,
                    3,
                    21,
                    0,
                    4,
                    1,
                    3,
                    0,
                    0
                ],
                [
                    4,
                    11,
                    0,
                    2,
                    17,
                    1,
                    2,
                    0,
                    1,
                    0,
                    0
                ],
                [
                    2,
                    14,
                    0,
                    1,
                    12,
                    0,
                    2,
                    1,
                    1,
                    0,
                    0
                ],
                [
                    3,
                    17,
                    0,
                    0,
                    10,
                    0,
                    1,
                    0,
                    1,
                    0,
                    0
                ],
                [
                    1,
                    1,
                    0,
                    1,
                    5,
                    0,
                    2,
                    0,
                    0,
                    0,
                    0
                ],
                [
                    0,
                    14,
                    0,
                    0,
                    1,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0
                ],
                [
                    3,
                    17,
                    0,
                    0,
                    7,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0
                ],
                [
                    0,
                    17,
                    0,
                    1,
                    5,
                    0,
                    2,
                    1,
                    1,
                    0,
                    0
                ],
                [
                    2,
                    28,
                    1,
                    1,
                    11,
                    1,
                    2,
                    1,
                    1,
                    0,
                    0
                ],
                [
                    0,
                    11,
                    0,
                    1,
                    8,
                    1,
                    2,
                    0,
                    0,
                    0,
                    0
                ],
                [
                    0,
                    0,
                    0,
                    1,
                    1,
                    0,
                    1,
                    0,
                    0,
                    0,
                    0
                ],
                [
                    1,
                    1,
                    0,
                    0,
                    1,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0
                ],
                [
                    12,
                    27,
                    1,
                    7,
                    41,
                    0,
                    8,
                    1,
                    6,
                    0,
                    0
                ],
                [
                    0,
                    0,
                    1,
                    0,
                    7,
                    1,
                    0,
                    0,
                    0,
                    1,
                    0
                ],
                [
                    2,
                    9,
                    0,
                    2,
                    17,
                    0,
                    3,
                    1,
                    3,
                    0,
                    0
                ],
                [
                    2,
                    14,
                    0,
                    0,
                    5,
                    0,
                    1,
                    0,
                    4,
                    0,
                    0
                ],
                [
                    1,
                    21,
                    4,
                    1,
                    13,
                    0,
                    2,
                    0,
                    5,
                    0,
                    0
                ]
            ]
        }
    ],
    "function_names": [ // function names, including both external and local functions
        "sub_401000",
        "start",
        "GetTempPathW",
        "GetFileSize",
        "GetCurrentDirectoryW",
        "DeleteFileW",
        "CloseHandle",
        "WriteFile",
        "lstrcmpW",
        "ReadFile",
        "GetModuleHandleW",
        "ExitProcess",
        "HeapCreate",
        "HeapAlloc",
        "GetModuleFileNameW",
        "CreateFileW",
        "lstrlenW",
        "ShellExecuteW",
        "wsprintfW",
        "HttpSendRequestW",
        "InternetSetOptionW",
        "InternetQueryOptionW",
        "HttpOpenRequestW",
        "HttpQueryInfoW",
        "InternetReadFile",
        "InternetConnectW",
        "InternetOpenW"
    ], // √
    "hash": "316ebb797d5196020eee013cfe771671fff4da8859adc9f385f52a74e82f4e55", // file hash; the md5 in the file name can be used instead √
    "function_number": 27 // number of functions √
 }
--- a/Genius3/python/PySide/QtCore.pyd
+++ b/Genius3/python/PySide/QtCore.pyd
--- a/Genius3/python/PySide/QtDeclarative.pyd
+++ b/Genius3/python/PySide/QtDeclarative.pyd
--- a/Genius3/python/PySide/QtGui.pyd
+++ b/Genius3/python/PySide/QtGui.pyd
--- a/Genius3/python/PySide/QtHelp.pyd
+++ b/Genius3/python/PySide/QtHelp.pyd
--- a/Genius3/python/PySide/QtMultimedia.pyd
+++ b/Genius3/python/PySide/QtMultimedia.pyd
--- a/Genius3/python/PySide/QtNetwork.pyd
+++ b/Genius3/python/PySide/QtNetwork.pyd
--- a/Genius3/python/PySide/QtOpenGL.pyd
+++ b/Genius3/python/PySide/QtOpenGL.pyd
--- a/Genius3/python/PySide/QtScript.pyd
+++ b/Genius3/python/PySide/QtScript.pyd
--- a/Genius3/python/PySide/QtScriptTools.pyd
+++ b/Genius3/python/PySide/QtScriptTools.pyd
--- a/Genius3/python/PySide/QtSql.pyd
+++ b/Genius3/python/PySide/QtSql.pyd
--- a/Genius3/python/PySide/QtSvg.pyd
+++ b/Genius3/python/PySide/QtSvg.pyd
--- a/Genius3/python/PySide/QtTest.pyd
+++ b/Genius3/python/PySide/QtTest.pyd
--- a/Genius3/python/PySide/QtUiTools.pyd
+++ b/Genius3/python/PySide/QtUiTools.pyd
--- a/Genius3/python/PySide/QtXml.pyd
+++ b/Genius3/python/PySide/QtXml.pyd
--- a/Genius3/python/PySide/QtXmlPatterns.pyd
+++ b/Genius3/python/PySide/QtXmlPatterns.pyd
--- a/Genius3/python/PySide/init.py
+++ b/Genius3/python/PySide/init.py
@ -0,0 +1,3 @@
 # Names that ``from PySide import *`` tries to bring in.
 # NOTE(review): 'QtWebKit' is listed, but no QtWebKit binary is visible among the PySide files in this change - confirm it is shipped.
 __all__ = ['QtCore', 'QtGui', 'QtNetwork', 'QtOpenGL', 'QtSql', 'QtSvg', 'QtTest', 'QtWebKit', 'QtScript']
 # Package version as a display string.
 __version__         = "1.1.2"
 # Same version in tuple form; presumably (major, minor, micro, release level, serial) - TODO confirm.
 __version_info__    = (1, 1, 2, "final", 1)
--- a/Genius3/python/PySide/phonon.pyd
+++ b/Genius3/python/PySide/phonon.pyd
--- a/Genius3/python/PySide/pyside-python2.7.dll
+++ b/Genius3/python/PySide/pyside-python2.7.dll
--- a/Genius3/python/PySide/shiboken-python2.7.dll
+++ b/Genius3/python/PySide/shiboken-python2.7.dll
--- a/Genius3/python/idaapi.py
+++ b/Genius3/python/idaapi.py
--- a/Genius3/python/idaapi.pyc
+++ b/Genius3/python/idaapi.pyc
--- a/Genius3/python/idautils.py
+++ b/Genius3/python/idautils.py
@ -0,0 +1,830 @@
 #---------------------------------------------------------------------
 # IDAPython - Python plugin for Interactive Disassembler
 #
 # Copyright (c) 2004-2010 Gergely Erdelyi <gergely.erdelyi@d-dome.net>
 #
 # All rights reserved.
 #
 # For detailed copyright information see the file COPYING in
 # the root of the distribution archive.
 #---------------------------------------------------------------------
 """
 idautils.py - High level utility functions for IDA
 """
 import idaapi
 import idc
 import types
 import os
 def refs(ea, funcfirst, funcnext):
    """
    Walk a chain of references - INTERNAL USE ONLY.
    Starts with funcfirst(ea), then keeps calling funcnext(ea, previous),
    yielding every value obtained until idaapi.BADADDR shows up.
    @param ea:        address the references belong to
    @param funcfirst: callable returning the first reference for 'ea'
    @param funcnext:  callable returning the reference following a given one
    @return: generator of references (may yield nothing)
    """
    current = funcfirst(ea)
    while True:
        # BADADDR is the end-of-chain marker handed back by the callbacks
        if current == idaapi.BADADDR:
            return
        yield current
        current = funcnext(ea, current)
 def CodeRefsTo(ea, flow):
    """
    Iterate over the code references to 'ea'
    @param ea:   Target address
    @param flow: Follow normal code flow or not
    @type  flow: Boolean (0/1, False/True)
    @return: generator of references (may yield nothing)
    Example::
        for ref in CodeRefsTo(ScreenEA(), 1):
            print ref
    """
    # flow == 1 selects the *_cref_to pair, anything else the *_fcref_to pair
    if flow == 1:
        first, following = idaapi.get_first_cref_to, idaapi.get_next_cref_to
    else:
        first, following = idaapi.get_first_fcref_to, idaapi.get_next_fcref_to
    return refs(ea, first, following)
 def CodeRefsFrom(ea, flow):
    """
    Iterate over the code references from 'ea'
    @param ea:   Target address
    @param flow: Follow normal code flow or not
    @type  flow: Boolean (0/1, False/True)
    @return: generator of references (may yield nothing)
    Example::
        for ref in CodeRefsFrom(ScreenEA(), 1):
            print ref
    """
    # flow == 1 selects the *_cref_from pair, anything else the *_fcref_from pair
    if flow == 1:
        first, following = idaapi.get_first_cref_from, idaapi.get_next_cref_from
    else:
        first, following = idaapi.get_first_fcref_from, idaapi.get_next_fcref_from
    return refs(ea, first, following)
 def DataRefsTo(ea):
    """
    Iterate over the data references to 'ea'
    @param ea:   Target address
    @return: generator of references (may yield nothing)
    Example::
        for ref in DataRefsTo(ScreenEA()):
            print ref
    """
    first = idaapi.get_first_dref_to
    following = idaapi.get_next_dref_to
    return refs(ea, first, following)
 def DataRefsFrom(ea):
    """
    Iterate over the data references from 'ea'
    @param ea:   Target address
    @return: generator of references (may yield nothing)
    Example::
        for ref in DataRefsFrom(ScreenEA()):
            print ref
    """
    first = idaapi.get_first_dref_from
    following = idaapi.get_next_dref_from
    return refs(ea, first, following)
 def XrefTypeName(typecode):
    """
    Translate a cross-reference type code into its readable name
    @param typecode: cross-reference type code
    @return: name of the reference type; an unknown code trips an assertion
    """
    data_names = ('Data_Unknown', 'Data_Offset', 'Data_Write',
                  'Data_Read', 'Data_Text', 'Data_Informational')
    code_names = ('Code_Far_Call', 'Code_Near_Call', 'Code_Far_Jump',
                  'Code_Near_Jump', 'Code_User', 'Ordinary_Flow')
    # Data reference codes are numbered from 0, code reference codes from 16
    names_by_code = dict(enumerate(data_names))
    names_by_code.update(enumerate(code_names, 16))
    assert typecode in names_by_code, "unknown reference type %d" % typecode
    return names_by_code[typecode]
 def _copy_xref(xref):
    """ Snapshot the fields of an xref object into a private holder so they are preserved """
    class _xref(object):
        pass
    snapshot = _xref()
    # The callers in this file reuse one xref object while iterating, hence the copy
    for field in ('frm', 'to', 'iscode', 'type', 'user'):
        setattr(snapshot, field, getattr(xref, field))
    return snapshot
 def XrefsFrom(ea, flags=0):
    """
    Iterate over all references from address 'ea'
    @param ea: Reference address
    @param flags: any of idaapi.XREF_* flags
    @return: generator of private xref copies (frm, to, iscode, type, user)
    Example::
           for xref in XrefsFrom(here(), 0):
               print xref.type, XrefTypeName(xref.type), \
                         'from', hex(xref.frm), 'to', hex(xref.to)
    """
    cursor = idaapi.xrefblk_t()
    more = cursor.first_from(ea, flags)
    while more:
        # The cursor object is reused for every step, so hand out a copy
        yield _copy_xref(cursor)
        more = cursor.next_from()
 def XrefsTo(ea, flags=0):
    """
    Iterate over all references to address 'ea'
    @param ea: Reference address
    @param flags: any of idaapi.XREF_* flags
    @return: generator of private xref copies (frm, to, iscode, type, user)
    Example::
           for xref in XrefsTo(here(), 0):
               print xref.type, XrefTypeName(xref.type), \
                         'from', hex(xref.frm), 'to', hex(xref.to)
    """
    cursor = idaapi.xrefblk_t()
    more = cursor.first_to(ea, flags)
    while more:
        # The cursor object is reused for every step, so hand out a copy
        yield _copy_xref(cursor)
        more = cursor.next_to()
 def Threads():
    """Yields the ID of every thread, in index order"""
    count = idc.GetThreadQty()
    for index in xrange(0, count):
        yield idc.GetThreadId(index)
 def Heads(start=None, end=None):
    """
    Get a list of heads (instructions or data)
    @param start: start address (default: inf.minEA)
    @param end:   end address (default: inf.maxEA)
    @return: generator of head addresses between start and end
    @note: Only an omitted (None) bound falls back to its default; an
           explicit 0 is used as given.
    """
    # Test against None, not truthiness: 0 is a legitimate address and
    # must not be silently replaced by the database bounds
    if start is None:
        start = idaapi.cvar.inf.minEA
    if end is None:
        end = idaapi.cvar.inf.maxEA
    ea = start
    # 'start' itself is yielded only when it is a head; otherwise skip ahead
    if not idc.isHead(idc.GetFlags(ea)):
        ea = idaapi.next_head(ea, end)
    while ea != idaapi.BADADDR:
        yield ea
        ea = idaapi.next_head(ea, end)
 def Functions(start=None, end=None):
    """
    Get a list of functions
    @param start: start address (default: inf.minEA)
    @param end:   end address (default: inf.maxEA)
    @return: generator of function start addresses between start and end
    @note: The last function that starts before 'end' is included even
    if it extends beyond 'end'. Any function that has its chunks scattered
    in multiple segments will be reported multiple times, once in each segment
    as they are listed.
    @note: Only an omitted (None) bound falls back to its default; an
    explicit 0 is used as given.
    """
    # Test against None, not truthiness: 0 is a legitimate address and
    # must not be silently replaced by the database bounds
    if start is None:
        start = idaapi.cvar.inf.minEA
    if end is None:
        end = idaapi.cvar.inf.maxEA
    # find first function head chunk in the range
    chunk = idaapi.get_fchunk(start)
    if not chunk:
        chunk = idaapi.get_next_fchunk(start)
    # skip chunks flagged FUNC_TAIL so iteration begins at a function head
    while chunk and chunk.startEA < end and (chunk.flags & idaapi.FUNC_TAIL) != 0:
        chunk = idaapi.get_next_fchunk(chunk.startEA)
    func = chunk
    while func and func.startEA < end:
        startea = func.startEA
        yield startea
        func = idaapi.get_next_func(startea)
 def Chunks(start):
    """
    Iterate over the chunks of a function
    @param start: address of the function
    @return: generator of function chunks (tuples of the form
             (start_ea, end_ea)) belonging to the function
    """
    owner = idaapi.get_func( start )
    walker = idaapi.func_tail_iterator_t( owner )
    more = walker.main()
    while more:
        piece = walker.chunk()
        yield (piece.startEA, piece.endEA)
        more = walker.next()
 def Modules():
    """
    Yields one object per module, carrying the name, size, base and rebase_to attributes
    """
    info = idaapi.module_info_t()
    more = idaapi.get_first_module(info)
    while more:
        # 'info' is refilled on every step, so its fields are copied into a fresh object
        yield idaapi.object_t(name=info.name, size=info.size, base=info.base, rebase_to=info.rebase_to)
        more = idaapi.get_next_module(info)
 def Names():
    """
    Iterate over the name list
    @return: generator of tuples (ea, name)
    """
    total = idaapi.get_nlist_size()
    for index in xrange(total):
        yield (idaapi.get_nlist_ea(index), idaapi.get_nlist_name(index))
 def Segments():
    """
    Iterate over the segments (sections) in the binary image
    @return: generator of segment start addresses.
    """
    for index in xrange(idaapi.get_segm_qty()):
        segment = idaapi.getnseg(index)
        # indexes for which no segment object comes back are skipped
        if not segment:
            continue
        yield segment.startEA
 def Entries():
    """
    Iterate over the entry points
    @return: generator of tuples (index, ordinal, ea, name)
    """
    for index in xrange(0, idaapi.get_entry_qty()):
        # the address and the name are both looked up by ordinal, not by index
        ordinal = idaapi.get_entry_ordinal(index)
        yield (index, ordinal, idaapi.get_entry(ordinal), idaapi.get_entry_name(ordinal))
 def FuncItems(start):
    """
    Iterate over the items of a function
    @param start: address of the function
    @return: generator with the ea of each item in the function; yields
             nothing when no function is found at 'start'
    """
    func = idaapi.get_func(start)
    if func:
        walker = idaapi.func_item_iterator_t()
        more = walker.set(func)
        while more:
            yield walker.current()
            more = walker.next_code()
 def Structs():
    """
    Iterate over the structures
    @return: generator of tuples (idx, sid, name)
    """
    index = idc.GetFirstStrucIdx()
    while index != idaapi.BADADDR:
        struct_id = idc.GetStrucId(index)
        struct_name = idc.GetStrucName(struct_id)
        yield (index, struct_id, struct_name)
        index = idc.GetNextStrucIdx(index)
 def StructMembers(sid):
    """
    Iterate over structure members information (or stack vars if given a frame).
    @param sid: ID of the structure.
    @return: generator of tuples (offset, name, size)
    @note: If 'sid' does not refer to a valid structure,
           an exception will be raised. As this is a generator, that
           happens when iteration starts, not at call time.
    @note: This will not return 'holes' in structures/stack frames;
           it only returns defined structure members.
    """
    offset = idc.GetFirstMember(sid)
    if offset == -1:
        raise Exception("No structure with ID: 0x%x" % sid)
    while offset != idaapi.BADADDR:
        member_name = idc.GetMemberName(sid, offset)
        # offsets without a member name are the 'holes' and are not reported
        if member_name:
            yield (offset, member_name, idc.GetMemberSize(sid, offset))
        offset = idc.GetStrucNextOff(sid, offset)
 def DecodePrecedingInstruction(ea):
    """
    Decode the instruction preceding 'ea' in the execution flow.
    @param ea: address to decode
    @return: (None or the decode instruction, farref)
             farref will contain 'true' if followed an xref, false otherwise
    """
    prev_addr, farref = idaapi.decode_preceding_insn(ea)
    if prev_addr != idaapi.BADADDR:
        # hand out a copy of idaapi.cmd rather than the shared object itself
        return (idaapi.cmd.copy(), farref)
    return (None, False)
 def DecodePreviousInstruction(ea):
    """
    Decode the previous instruction and return an insn_t like class
    @param ea: address to decode
    @return: None or a new insn_t instance
    """
    decoded = idaapi.decode_prev_insn(ea) != idaapi.BADADDR
    # on success the result is a copy of idaapi.cmd
    return idaapi.cmd.copy() if decoded else None
 def DecodeInstruction(ea):
    """
    Decode the instruction at 'ea' and return an insn_t like class
    @param ea: address to decode
    @return: None or a new insn_t instance
    """
    length = idaapi.decode_insn(ea)
    # a zero length is treated as a failed decode
    return idaapi.cmd.copy() if length != 0 else None
 def GetDataList(ea, count, itemsize=1):
    """
    Get data list - INTERNAL USE ONLY
    Generator yielding 'count' consecutive items of 'itemsize' bytes each,
    read starting at 'ea'.
    @param ea:       address of the first item
    @param count:    number of items to read
    @param itemsize: size of one item in bytes: 1, 2, 4 or 8 [default: 1]
    @return: generator of the values returned by the matching idaapi reader
    @raise ValueError: if 'itemsize' is not 1, 2, 4 or 8; because this is
                       a generator the error surfaces on first iteration
    """
    if itemsize == 1:
        getdata = idaapi.get_byte
    elif itemsize == 2:
        getdata = idaapi.get_word
    elif itemsize == 4:
        getdata = idaapi.get_long
    elif itemsize == 8:
        getdata = idaapi.get_qword
    else:
        # Call form of raise: the "raise E, msg" statement form is Python 2 only
        raise ValueError("Invalid data size! Must be 1, 2, 4 or 8")
    endea = ea + itemsize * count
    curea = ea
    while curea < endea:
        yield getdata(curea)
        curea += itemsize
 def PutDataList(ea, datalist, itemsize=1):
    """
    Put data list - INTERNAL USE ONLY
    Patches the values of 'datalist' one after another, starting at 'ea'
    and stepping 'itemsize' bytes per value.
    @param ea:       address of the first item
    @param datalist: iterable of values to write
    @param itemsize: size of one item in bytes: 1, 2 or 4 [default: 1]
    """
    writer = None
    if itemsize == 1:
        writer = idaapi.patch_byte
    elif itemsize == 2:
        writer = idaapi.patch_word
    elif itemsize == 4:
        writer = idaapi.patch_long
    # any other size leaves 'writer' unset and trips the assertion
    assert writer, "Invalid data size! Must be 1, 2 or 4"
    target = ea
    for value in datalist:
        writer(target, value)
        target = target + itemsize
 def MapDataList(ea, length, func, wordsize=1):
    """
    Apply a function to a run of data words in the database, in place
    @param ea:       start address
    @param length:   number of words to map
    @param func:     mapping function
    @param wordsize: size of words to map [default: 1 byte]
    @return: None
    """
    originals = GetDataList(ea, length, wordsize)
    mapped = map(func, originals)
    PutDataList(ea, mapped, wordsize)
 def GetInputFileMD5():
    """
    Fetch the MD5 hash of the input binary file
    @return: MD5 string or None on error
    """
    digest = idc.GetInputMD5()
    return digest
 class Strings(object):
    """
    Allows iterating over the string list. The set of strings will not be modified,
    unless asked explicitly at setup()-time.
    Example:
        s = Strings()
        for i in s:
            print "%x: len=%d type=%d -> '%s'" % (i.ea, i.length, i.type, str(i))
    """
    class StringItem(object):
        """
        One entry of the string list.
        """
        def __init__(self, si):
            # String ea
            self.ea = si.ea
            # string type (ASCSTR_xxxxx)
            self.type = si.type
            # string length
            self.length = si.length
        def is_1_byte_encoding(self):
            # one byte per character is whatever is neither 2- nor 4-byte
            return not (self.is_2_bytes_encoding() or self.is_4_bytes_encoding())
        def is_2_bytes_encoding(self):
            # only the low three bits of the type take part in the comparison
            kind = self.type & 7
            return kind in [idaapi.ASCSTR_UTF16, idaapi.ASCSTR_ULEN2, idaapi.ASCSTR_ULEN4]
        def is_4_bytes_encoding(self):
            kind = self.type & 7
            return kind == idaapi.ASCSTR_UTF32
        def _toseq(self, as_unicode):
            # Pick the idaapi conversion flag and the matching Python codec name
            if self.is_2_bytes_encoding():
                conv, pyenc = idaapi.ACFOPT_UTF16, "utf-16"
            elif self.is_4_bytes_encoding():
                conv, pyenc = idaapi.ACFOPT_UTF8, "utf-8"
            else:
                conv, pyenc = idaapi.ACFOPT_ASCII, 'ascii'
            raw = idaapi.get_ascii_contents2(self.ea, self.length, self.type, conv)
            if as_unicode:
                # undecodable input is substituted rather than raising
                return unicode(raw, pyenc, 'replace')
            return raw
        def __str__(self):
            return self._toseq(False)
        def __unicode__(self):
            return self._toseq(True)
    STR_C       = 0x0001
    """C-style ASCII string"""
    STR_PASCAL  = 0x0002
    """Pascal-style ASCII string (length byte)"""
    STR_LEN2    = 0x0004
    """Pascal-style, length is 2 bytes"""
    STR_UNICODE = 0x0008
    """Unicode string"""
    STR_LEN4    = 0x0010
    """Pascal-style, length is 4 bytes"""
    STR_ULEN2   = 0x0020
    """Pascal-style Unicode, length is 2 bytes"""
    STR_ULEN4   = 0x0040
    """Pascal-style Unicode, length is 4 bytes"""
    def clear_cache(self):
        """Clears the strings list cache"""
        self.refresh(0, 0) # when ea1=ea2 the kernel will clear the cache
    def __init__(self, default_setup = False):
        """
        Initializes the Strings enumeration helper class
        @param default_setup: Set to True to use default setup (C strings, min len 5, ...)
        """
        self.size = 0
        if default_setup:
            self.setup()
        else:
            self.refresh()
        self._si  = idaapi.string_info_t()
    def refresh(self, ea1=None, ea2=None):
        """Refreshes the strings list"""
        if ea1 is None:
            ea1 = idaapi.cvar.inf.minEA
        if ea2 is None:
            ea2 = idaapi.cvar.inf.maxEA
        idaapi.refresh_strlist(ea1, ea2)
        self.size = idaapi.get_strlist_qty()
    def setup(self,
              strtypes = STR_C,
              minlen = 5,
              only_7bit = True,
              ignore_instructions = False,
              ea1 = None,
              ea2 = None,
              display_only_existing_strings = False):
        if ea1 is None:
            ea1 = idaapi.cvar.inf.minEA
        if ea2 is None:
            ea2 = idaapi.cvar.inf.maxEA
        t = idaapi.strwinsetup_t()
        t.strtypes = strtypes
        t.minlen = minlen
        t.only_7bit = only_7bit
        t.ea1 = ea1
        t.ea2 = ea2
        t.display_only_existing_strings = display_only_existing_strings
        idaapi.set_strlist_options(t)
        # Automatically refreshes
        self.refresh()
    def _get_item(self, index):
        if not idaapi.get_strlist_item(index, self._si):
            return None
        else:
            return Strings.StringItem(self._si)
    def __iter__(self):
        return (self._get_item(index) for index in xrange(0, self.size))
    def __getitem__(self, index):
        """Returns a string item or None"""
        if index >= self.size:
            raise KeyError
        else:
            return self._get_item(index)
 # -----------------------------------------------------------------------
 def GetIdbDir():
    """
    Get IDB directory

    Returns the directory part of the current IDB database path,
    with a trailing path separator appended.
    """
    idb_path = idaapi.cvar.database_idb
    return os.path.dirname(idb_path) + os.sep
 # -----------------------------------------------------------------------
 def GetRegisterList():
    """Returns the register list, as given by idaapi.ph_get_regnames()"""
    register_names = idaapi.ph_get_regnames()
    return register_names
 # -----------------------------------------------------------------------
 def GetInstructionList():
    """
    Returns the instruction list of the current processor module.
    Entries whose first element is empty are left out.
    """
    names = []
    for entry in idaapi.ph_get_instruc():
        mnemonic = entry[0]
        if mnemonic:
            names.append(mnemonic)
    return names
 # -----------------------------------------------------------------------
 def _Assemble(ea, line):
    """
    Please refer to Assemble() - INTERNAL USE ONLY

    Assembles the given line(s) one after the other; each line is assembled
    at the address following the bytes produced by the previous one.

    @param ea:   start address
    @param line: a single string, or a sequence of strings
    @return: (False, "Error message") as soon as one line fails, otherwise
             (True, asm_buf) for a single result or (True, [asm_buf, ...])
    """
    # A plain string is treated as a one-element list
    lines = [line] if type(line) == types.StringType else line
    assembled = []
    for text in lines:
        seg = idaapi.getseg(ea)
        if not seg:
            return (False, "No segment at ea")
        # Instruction pointer relative to the segment base
        segment_base = idaapi.ask_selector(seg.sel) << 4
        ip = ea - segment_base
        buf = idaapi.AssembleLine(ea, seg.sel, ip, seg.bitness, text)
        if not buf:
            return (False, "Assembler failed: " + text)
        assembled.append(buf)
        ea += len(buf)
    if len(assembled) == 1:
        return (True, assembled[0])
    return (True, assembled)
 def Assemble(ea, line):
    """
    Assembles one or more lines (does not display any message dialogs)
    If line is a list then this function will attempt to assemble all the lines
    This function will turn on batch mode temporarily so that no messages are displayed on the screen

    @param ea:       start address
    @param line:     a single line (string) or a list of lines
    @return: (False, "Error message") or (True, asm_buf) or (True, [asm_buf1, asm_buf2, asm_buf3])
    """
    old_batch = idc.Batch(1)
    try:
        return _Assemble(ea, line)
    finally:
        # Always restore the previous batch mode, even if assembling raised,
        # so that a failure does not leave IDA stuck in batch mode.
        idc.Batch(old_batch)
 def _copy_obj(src, dest, skip_list = None):
    """
    Copy the non special, non callable attributes of one object onto another

    @param src: Source object to copy from
    @param dest: If it is a string, a new class of that name is created with
                 new.classobj() and used as the destination.
                 Otherwise dest is the object that receives the attributes
    @param skip_list: optional collection of attribute names to leave out
    @return: The newly created class or "dest"
    """
    if type(dest) == types.StringType:
        # create the destination from the given type name
        dest = new.classobj(dest, (), {})
    for attr_name in dir(src):
        # special (dunder) fields are never copied
        is_special = attr_name.startswith("__") and attr_name.endswith("__")
        if is_special or (skip_list and attr_name in skip_list):
            continue
        value = getattr(src, attr_name)
        # only plain data is copied, callables are left out
        if not callable(value):
            setattr(dest, attr_name, value)
    return dest
 # -----------------------------------------------------------------------
 class _reg_dtyp_t(object):
    """
    INTERNAL
    This class describes a register's number and dtyp.
    The equality operators are overloaded so that an instance can be compared
    with any object exposing 'reg' and 'dtyp' attributes.
    """
    def __init__(self, reg, dtyp):
        """
        @param reg:  register number
        @param dtyp: register dtyp
        """
        self.reg  = reg
        self.dtyp = dtyp

    def __eq__(self, other):
        """Equal when both 'reg' and 'dtyp' match those of 'other'"""
        try:
            return (self.reg == other.reg) and (self.dtyp == other.dtyp)
        except AttributeError:
            # 'other' is not register-like (e.g. None): let Python fall back
            # to its default comparison instead of raising
            return NotImplemented

    def __ne__(self, other):
        """
        Negation of __eq__. Python 2 does not derive '!=' from '__eq__', so
        without this method two equal instances would also compare unequal.
        """
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
 # -----------------------------------------------------------------------
 class _procregs(object):
    """
    Utility class allowing the users to identify registers in a decoded instruction.
    Register names are resolved on first attribute access and then cached on
    the instance; attributes cannot be assigned from the outside.
    """
    def __getattr__(self, attr):
        """Resolve 'attr' as a register name; AttributeError if it is not one"""
        reg_info = idaapi.reg_info_t()
        if not idaapi.parse_reg_name(attr, reg_info):
            raise AttributeError()
        dtyp = ord(idaapi.get_dtyp_by_size(reg_info.size))
        resolved = _reg_dtyp_t(reg_info.reg, dtyp)
        # Cache through __dict__ directly, since __setattr__ always refuses
        self.__dict__[attr] = resolved
        return resolved

    def __setattr__(self, attr, value):
        """Assignment is not supported"""
        raise AttributeError(attr)
 # -----------------------------------------------------------------------
 class _cpu(object):
    """
    Simple wrapper around GetRegValue/SetRegValue: reading an attribute calls
    idc.GetRegValue(name), assigning one calls idc.SetRegValue(value, name).
    """
    def __getattr__(self, name):
        register_value = idc.GetRegValue(name)
        return register_value

    def __setattr__(self, name, value):
        status = idc.SetRegValue(value, name)
        return status
 # --------------------------------------------------------------------------
 class __process_ui_actions_helper(object):
    """
    Callable that runs a list of UI actions one per call.

    Each call processes the next action through idaapi.process_ui_action()
    and returns True; once every action has been processed it returns False.
    """
    def __init__(self, actions, flags = 0):
        """
        Expect a list or a string with a list of actions

        @param actions: a string of action names separated by semicolons,
                        or a list/tuple of action names
        @param flags: flags passed along to process_ui_action()
        @raise ValueError: if actions is not a string, list or tuple
        """
        if isinstance(actions, str):
            lst = actions.split(";")
        elif isinstance(actions, (list, tuple)):
            lst = actions
        else:
            # Call form of raise: same behaviour as the old "raise X, msg"
            # statement, but consistent with the rest of the module.
            raise ValueError("Must pass a string, list or a tuple")
        # Remember the action list and the flags
        self.__action_list = lst
        self.__flags = flags
        # Reset action index
        self.__idx = 0

    def __len__(self):
        """Number of actions in the list"""
        return len(self.__action_list)

    def __call__(self):
        """Process the next action; False when there is nothing left to do"""
        if self.__idx >= len(self.__action_list):
            return False
        # Execute one action
        idaapi.process_ui_action(
                self.__action_list[self.__idx],
                self.__flags)
        # Move to next action
        self.__idx += 1
        # Reschedule
        return True
 # --------------------------------------------------------------------------
 def ProcessUiActions(actions, flags=0):
    """
    Schedule a list of UI actions through execute_ui_requests().

    @param actions: A string containing a list of actions separated by semicolon, a list or a tuple
    @param flags: flags to be passed to process_ui_action()
    @return: Boolean. Returns False if the action list was empty or execute_ui_requests() failed.
    """
    request = __process_ui_actions_helper(actions, flags)
    # Nothing to schedule for an empty action list
    if len(request) < 1:
        return False
    return idaapi.execute_ui_requests((request,))
 # -----------------------------------------------------------------------
 class peutils_t(object):
    """
    PE utility class. Retrieves PE information from the database.
    Constants from pe.h

    Attributes:
        imagebase:     altval stored under PE_ALT_IMAGEBASE
        header_offset: altval stored under PE_ALT_PEHDR_OFF
    """
    PE_NODE = "$ PE header" # netnode name for PE header
    PE_ALT_DBG_FPOS   = idaapi.BADADDR & -1 #  altval() -> translated fpos of debuginfo
    PE_ALT_IMAGEBASE  = idaapi.BADADDR & -2 #  altval() -> loading address (usually pe.imagebase)
    PE_ALT_PEHDR_OFF  = idaapi.BADADDR & -3 #  altval() -> offset of PE header
    PE_ALT_NEFLAGS    = idaapi.BADADDR & -4 #  altval() -> neflags
    PE_ALT_TDS_LOADED = idaapi.BADADDR & -5 #  altval() -> tds already loaded(1) or invalid(-1)
    PE_ALT_PSXDLL     = idaapi.BADADDR & -6 #  altval() -> if POSIX(x86) imports from PSXDLL netnode

    def __init__(self):
        self.__penode = idaapi.netnode()
        self.__penode.create(peutils_t.PE_NODE)

    imagebase = property(
        lambda self: self.__penode.altval(peutils_t.PE_ALT_IMAGEBASE)
      )

    # This used to be a property named 'header', but the header() method
    # defined below has the same name and replaced it, which made __str__
    # call hex() on a bound method. The offset now has its own name.
    header_offset = property(
        lambda self: self.__penode.altval(peutils_t.PE_ALT_PEHDR_OFF)
      )

    def __str__(self):
        return "peutils_t(imagebase=%s, header=%s)" % (hex(self.imagebase), hex(self.header_offset))

    def header(self):
        """
        Returns the complete PE header as an instance of peheader_t (defined in the SDK).
        """
        return self.__penode.valobj()
 # -----------------------------------------------------------------------
 cpu = _cpu()
 """This is a special class instance used to access the registers as if they were attributes of this object.
 For example to access the EAX register:
    print "%x" % cpu.Eax
 """
 procregs = _procregs()
 """This object is used to access the processor registers. It is useful when decoding instructions and you want to see which instruction is which.
 For example:
    x = idautils.DecodeInstruction(here())
    if x[0] == procregs.Esp:
        print "This operand is the register ESP"
 """
--- a/Genius3/python/idautils.pyc
+++ b/Genius3/python/idautils.pyc
--- a/Genius3/python/idc.py
+++ b/Genius3/python/idc.py
--- a/Genius3/python/idc.pyc
+++ b/Genius3/python/idc.pyc
--- a/Genius3/python/init.py
+++ b/Genius3/python/init.py
@ -0,0 +1,111 @@
 #!/usr/bin/env python
 # -----------------------------------------------------------------------
 # IDAPython - Python plugin for Interactive Disassembler
 #
 # Copyright (c) The IDAPython Team <idapython@googlegroups.com>
 #
 # All rights reserved.
 #
 # For detailed copyright information see the file COPYING in
 # the root of the distribution archive.
 # -----------------------------------------------------------------------
 # init.py - Essential init routines
 # -----------------------------------------------------------------------
 import os
 import sys
 import time
 import warnings
 import _idaapi
 # __EA64__ is set if IDA is running in 64-bit mode
 __EA64__ = _idaapi.BADADDR == 0xFFFFFFFFFFFFFFFFL
 # -----------------------------------------------------------------------
 # Take over the standard text outputs
 # -----------------------------------------------------------------------
 class IDAPythonStdOut:
    """
    Minimal file-like object that receives stdout and stderr and forwards
    everything written to it to _idaapi.msg()
    """
    def write(self, text):
        """Forward 'text' to the IDA message output"""
        # NB: in case 'text' is Unicode, msg() will decode it
        # and call umsg() to print it
        _idaapi.msg(text)

    def flush(self):
        """Nothing to flush"""
        return None

    def isatty(self):
        """This object is never a terminal"""
        return False
 # -----------------------------------------------------------------------
 def runscript(script):
    """
    Executes a script in this module's global namespace.
    This function is present for backward compatibility. Please use idaapi.IDAPython_ExecScript() instead

    @param script: script path
    @return: Error string or None on success
    """
    import idaapi
    namespace = globals()
    return idaapi.IDAPython_ExecScript(script, namespace)
 # -----------------------------------------------------------------------
 def print_banner():
    """
    Print the Python and IDAPython version lines, framed by two separator
    lines that are one character longer than the longest banner line.
    """
    python_line = "Python %s " % sys.version
    bitness = " 64-bit" if __EA64__ else ""
    # The % formatting applies to the version/copyright string only
    version_line = "IDAPython" + bitness + " v%d.%d.%d %s (serial %d) (c) The IDAPython Team <idapython@googlegroups.com>" % IDAPYTHON_VERSION
    banner = [python_line, version_line]
    width = max(len(entry) for entry in banner) + 1
    sepline = '-' * width
    print(sepline)
    print("\n".join(banner))
    print(sepline)
 # -----------------------------------------------------------------------
 # Redirect stderr and stdout to the IDA message window
 _orig_stdout = sys.stdout;
 _orig_stderr = sys.stderr;
 sys.stdout = sys.stderr = IDAPythonStdOut()
 # -----------------------------------------------------------------------
 # Initialize the help, with our own stdin wrapper, that'll query the user
 # -----------------------------------------------------------------------
 import pydoc
 class IDAPythonHelpPrompter:
    """
    Input object handed to pydoc.Helper: every readline() asks the user for
    a help topic through idaapi.askstr()
    """
    def readline(self):
        topic = idaapi.askstr(0, '', 'Help topic?')
        return topic
 help = pydoc.Helper(input = IDAPythonHelpPrompter(), output = sys.stdout)
 # Assign a default sys.argv
 sys.argv = [""]
 # Have to make sure Python finds our modules: add IDA's "python" directory
 # to the end of the module search path
 sys.path.append(_idaapi.idadir("python"))
 # Remove current directory from the top of the path search
 if '' in sys.path: # On non Windows, the empty path is added
    sys.path.remove('')
 if os.getcwd() in sys.path:
    sys.path.remove(os.getcwd())
 # ...and add it to the end if needed
 # NOTE(review): IDAPYTHON_REMOVE_CWD_SYS_PATH is not defined in this file;
 # presumably it is injected into the namespace by the host plugin - confirm.
 if not IDAPYTHON_REMOVE_CWD_SYS_PATH:
    sys.path.append(os.getcwd())
 # Import all the required modules. This has to happen after the sys.path
 # changes above; the star imports make the idc/idautils names available
 # directly in this namespace.
 from idaapi import Choose, get_user_idadir, cvar, Choose2, Appcall, Form
 from idc      import *
 from idautils import *
 import idaapi
 # Load the user's personal init file (idapythonrc.py in the user IDA
 # directory), if it exists, executing it in this module's globals
 userrc = os.path.join(get_user_idadir(), "idapythonrc.py")
 if os.path.exists(userrc):
    idaapi.IDAPython_ExecScript(userrc, globals())
 # All done, ready to rock.
--- a/Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py
+++ b/Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py
@ -1,81 +0,0 @@
 class HierarchicalGraphNeuralNetwork(nn.Module):
    """
    Two-level GraphSAGE classifier.

    Level 1 embeds the basic blocks of every control flow graph (CFG) and
    max-pools them into one vector per function. Level 2 joins those vectors
    with learned embeddings of external function names, runs GraphSAGE over
    the function call graph (FCG) and max-pools it into one vector per
    sample, which is projected to 6 softmax outputs.
    """
    def __init__(self, external_vocab: Vocab):
        """
        :param external_vocab: vocabulary of external function names; its
            max_vocab_size and pad_idx size the embedding table.
        """
        super(HierarchicalGraphNeuralNetwork, self).__init__()
        self.pool = 'global_max_pool'

        # Hierarchical 1: Control Flow Graph (CFG) embedding and pooling
        cfg_filter_list = [11, 200, 200]
        self.cfg_filter_length = len(cfg_filter_list)
        self._add_sage_layers('CFG_gnn_{}', cfg_filter_list)
        self.dropout = nn.Dropout(p=0.2)

        # Hierarchical 2: Function Call Graph (FCG) embedding and pooling
        self.external_embedding_layer = nn.Embedding(num_embeddings=external_vocab.max_vocab_size + 2,
                                                     embedding_dim=cfg_filter_list[-1],
                                                     padding_idx=external_vocab.pad_idx)
        fcg_filter_list = [cfg_filter_list[-1], 200, 200]
        self.fcg_filter_length = len(fcg_filter_list)
        self._add_sage_layers('FCG_gnn_{}', fcg_filter_list)

        # Last Projection Function: gradually project with more linear layers
        hidden = fcg_filter_list[-1]
        self.pj1 = torch.nn.Linear(in_features=hidden, out_features=hidden // 2)
        self.pj2 = torch.nn.Linear(in_features=hidden // 2, out_features=hidden // 4)
        self.pj3 = torch.nn.Linear(in_features=hidden // 4, out_features=6)
        self.last_activation = nn.Softmax(dim=1)

    def _add_sage_layers(self, name_template, filter_list):
        """Register one SAGEConv per consecutive pair of sizes in filter_list."""
        for i in range(len(filter_list) - 1):
            layer = torch_geometric.nn.conv.SAGEConv(in_channels=filter_list[i],
                                                     out_channels=filter_list[i + 1],
                                                     bias=True)
            setattr(self, name_template.format(i + 1), layer)

    def _run_gnn_stack(self, name_template, num_layers, x, edge_index):
        """Apply the named conv layers in order, each followed by ReLU and dropout."""
        for i in range(num_layers):
            x = getattr(self, name_template.format(i + 1))(x=x, edge_index=edge_index)
            x = torch.nn.functional.relu(x, inplace=True)
            x = self.dropout(x)
        return x

    def forward(self, real_local_batch: Batch, real_bt_positions: list, bt_external_names: list,
                bt_all_function_edges: list):
        """
        :param real_local_batch: batch holding the CFGs of all samples
        :param real_bt_positions: boundaries into the pooled CFG vectors;
            sample i owns rows [positions[i], positions[i + 1])
        :param bt_external_names: per sample, the embedding ids of its
            external functions
        :param bt_all_function_edges: per sample, the FCG edge index
        :return: softmax output of the last projection, one row per sample
        """
        rtn_local_batch = self.forward_cfg_gnn(local_batch=real_local_batch)
        x_cfg_pool = torch_geometric.nn.global_max_pool(x=rtn_local_batch.x, batch=rtn_local_batch.batch)
        # Build the index tensors on the device the activations live on;
        # plain torch.tensor() would always create them on the CPU and break
        # as soon as the model runs on a GPU.
        device = x_cfg_pool.device

        fcg_list = []
        for idx_batch in range(len(real_bt_positions) - 1):
            start_pos, end_pos = real_bt_positions[idx_batch: idx_batch + 2]
            idx_x_cfg = x_cfg_pool[start_pos: end_pos]
            external_ids = torch.tensor([bt_external_names[idx_batch]], dtype=torch.long, device=device)
            idx_x_external = self.external_embedding_layer(external_ids).squeeze(dim=0)
            # Node features: pooled internal functions first, then externals
            idx_x_total = torch.cat([idx_x_cfg, idx_x_external], dim=0)
            idx_function_edge = torch.tensor(bt_all_function_edges[idx_batch], dtype=torch.long, device=device)
            idx_graph_data = Data(x=idx_x_total, edge_index=idx_function_edge)
            idx_graph_data.validate()
            fcg_list.append(idx_graph_data)
        fcg_batch = Batch.from_data_list(fcg_list)

        # Hierarchical 2: Function Call Graph (FCG) embedding and pooling
        rtn_fcg_batch = self.forward_fcg_gnn(function_batch=fcg_batch)
        x_fcg_pool = torch_geometric.nn.global_max_pool(x=rtn_fcg_batch.x, batch=rtn_fcg_batch.batch)

        # step last project to the number_of_classes (multiclass)
        bt_final_embed = self.pj3(self.pj2(self.pj1(x_fcg_pool)))
        return self.last_activation(bt_final_embed)

    def forward_cfg_gnn(self, local_batch: Batch):
        """Run the CFG conv stack; local_batch.x is replaced in place and the batch returned."""
        local_batch.x = self._run_gnn_stack('CFG_gnn_{}', self.cfg_filter_length - 1,
                                            local_batch.x, local_batch.edge_index)
        return local_batch

    def forward_fcg_gnn(self, function_batch: Batch):
        """Run the FCG conv stack; function_batch.x is replaced in place and the batch returned."""
        function_batch.x = self._run_gnn_stack('FCG_gnn_{}', self.fcg_filter_length - 1,
                                               function_batch.x, function_batch.edge_index)
        return function_batch
--- a/Genius3/raw-feature-extractor/cfg_constructor.py
+++ b/Genius3/raw-feature-extractor/cfg_constructor.py
@ -1,3 +1,9 @@
 import copy
 import networkx as nx
 from idautils import *
 from idaapi import *
 from idc import *
 import copy
 import networkx as nx
 from idautils import *
@ -93,11 +99,11 @@ def filtering(cfg):
 		bb_start = bb[0]
 		bb_end = bb[1]
 		re = remove(bb_start, bb_end)
-		print(bb_id, re, bb_start, bb_end)
+		print bb_id, re, bb_start, bb_end
 		if re:
-			print(re, bb_id)
+			print re, bb_id
 			rm_sets.append(bb_id)
-	print(rm_sets)
+	print rm_sets
 	for bb_id in rm_sets:
 		cfg.remove_node(bb_id)
@ -154,16 +160,16 @@ def attributingRe(cfg, externs_eas, ea_externs):
 def attributing(cfg):
 	ga = graph_analysis()
 	ga.gwithoffspring(cfg)
-	print("finishing offspring")
+	print "finishing offspring"
 	for node in cfg:
 		stmt_num = getStmtNum(node)
 		binary_value = getBinaryValue(node)
 		cfg.node[node]['stmt_num'] = stmt_num
 		cfg.node[node]['binary_value'] = binary_value
 	ga.domChecking(cfg)
-	print("finishing domChecking")
+	print "finishing domChecking"
 	ga.loopChecking(cfg)
-	print("finishing loopChecking")
+	print "finishing loopChecking"
 def getStmtNum(node):
@ -184,17 +190,17 @@ def getBinaryValue(node):
 	for x in xrange((inst_addr - start)-1):
 		addr = start + x
 		y = GetOriginalByte(addr)
-		print(value, addr, y)
+		print value, addr, y
 		value = value | y
 		value = value << 8
-		print(value)
+		print value
 	addr = inst_addr - 1
 	y = GetOriginalByte(addr)
-	print(value, addr, y)
+	print value, addr, y
 	value = value | y
-	print(node)
+	print node
-	print(bin(value))
+	print bin(value)
 	return value
--- a/Genius3/raw-feature-extractor/cfg_constructor.pyc
+++ b/Genius3/raw-feature-extractor/cfg_constructor.pyc
--- a/Genius3/raw-feature-extractor/convert_pkl_to_json.py
+++ b/Genius3/raw-feature-extractor/convert_pkl_to_json.py
@ -1,236 +0,0 @@
 # coding=utf-8
 import pickle as pk
 import re
 import json
 import os
 from tqdm import tqdm
 def _read_resume_index(log):
    """Return the index stored in the resume log, or 0 when the log is empty."""
    # A file opened in 'a+' mode may start positioned at end-of-file, in which
    # case reading without rewinding always yields an empty string.
    log.seek(0)
    logged = log.readline().strip()
    return int(logged) if logged else 0


 def _parse_dot_file(dot_file_path):
    """
    Read the function call graph from a .dot file.

    :return: (edges, names): edges holds, for every line containing '->',
        the list of integers found on that line; names holds the label text
        of every other line containing 'label'.
    """
    raw_function_edges = []
    functions_list = []
    with open(dot_file_path, 'r') as dot:
        for line in dot:
            if '->' in line:
                raw_function_edges.append(re.findall(r'\b\d+\b', line))
            elif 'label' in line:
                functions_list.append(line[line.find('= "') + 3:line.find('",')])
    return raw_function_edges, functions_list


 def _build_acfg_list(raw_graph_list):
    """
    Turn the per-function graphs of one binary into JSON-ready dicts.

    Only functions named 'start', 'start_0' or containing 'sub_' are kept.
    The offspring count (index 2 of each node's 'v' attribute) is appended
    to the matching entry of bb_features, which is modified in place.
    """
    acfg_list = []
    for acfg in raw_graph_list:
        # Any other name is treated as an external function: no CFG is built
        if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname:
            continue
        # Index 2 because the Genius framework stores the offspring count there
        offspring = [d.get('v')[2] for d in acfg.g.node.values()]
        block_number = len(acfg.g)
        # The two lengths occasionally differ. The graph is authoritative, so
        # drop surplus feature rows. (The old 'del bb_features[diff:]' removed
        # the wrong rows when bb_features was the shorter of the two.)
        del acfg.bb_features[block_number:]
        # zip stops at the shorter sequence, so a short bb_features no longer
        # raises IndexError
        for features, offs in zip(acfg.bb_features, offspring):
            features.append(offs)
        acfg_list.append({
            'block_number': block_number,
            'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
            'block_features': acfg.bb_features
        })
    return acfg_list


 def _convert_directory(cfg_dir, dot_dir, output_dir, log_path, process_log_path, overhaul):
    """
    Convert every pickled cfg in cfg_dir, with its .dot file from dot_dir,
    into a .jsonl file in output_dir.

    log_path stores the index of the last converted file so that a later run
    can resume; process_log_path receives one line per processed file.
    When overhaul is true both logs are deleted first.
    """
    if overhaul:
        for stale_log in (log_path, process_log_path):
            if os.path.exists(stale_log):
                os.remove(stale_log)
    with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
        log_index = _read_resume_index(log)
        for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))):
            if index < log_index:
                continue
            name = cfg[:-4]  # bare file name, without its 4-character suffix
            # SECURITY: pickle.load executes whatever the file encodes; only
            # run this on cfg files you produced yourself.
            # NOTE(review): the file is opened in text mode, as before; pickles
            # written in binary mode need 'rb' - confirm against the writer.
            failure = None
            with open(os.path.join(cfg_dir, name + '.ida'), 'r') as cfg_file:
                try:
                    data = pk.load(cfg_file)
                except EOFError:
                    failure = "EOFError"
                except ValueError:
                    failure = "ValueError"
                except KeyError:
                    failure = "KeyError"
            if failure is not None:
                process_log.write("index {}, {} process failed. {} occurred.\n".format(index, cfg, failure))
                continue

            dot_file_path = os.path.join(dot_dir, name + '.dot')
            if not os.path.exists(dot_file_path):
                process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg))
                continue
            # 2023.8.12 bug fix: the fcg (.dot) file generated by IDA lists all
            # functions, while data.raw_graph_list only holds the internal
            # ones, so names and count are taken from the .dot file.
            raw_function_edges, functions_list = _parse_dot_file(dot_file_path)
            # No call edge found: this should not normally happen, so the
            # sample is dropped to be on the safe side
            if not raw_function_edges:
                continue

            # JSON object describing the current PE file
            json_obj = {
                'hash': data.binary_name[11:],
                'function_number': len(functions_list),
                'function_edges': [[int(d[0]) for d in raw_function_edges],
                                   [int(d[1]) for d in raw_function_edges]],
                'acfg_list': _build_acfg_list(data.raw_graph_list),
                'function_names': functions_list
            }
            result = json.dumps(json_obj, ensure_ascii=False)
            with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
                out.write(result)

            # Remember the last converted index so that a rerun can resume
            log.truncate(0)
            log.seek(0)
            log.write(str(index))
            log.flush()
            process_log.write("index {}, {} process done.\n".format(index, cfg))


 def convert(start, end, overhaul):
    """
    Convert the infected datasets numbered start (inclusive) to end (exclusive).

    :param start: first dataset number
    :param end: dataset number to stop before
    :param overhaul: when true, delete the logs and convert from scratch
    """
    for workflow in range(start, end):
        _convert_directory(
            cfg_dir="D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow),
            dot_dir="D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot".format(workflow),
            output_dir="D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow),
            log_path="D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow),
            process_log_path="D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow),
            overhaul=overhaul)


 def convert_benign(overhaul):
    """
    Convert the benign dataset.

    :param overhaul: when true, delete the logs and convert from scratch
    """
    _convert_directory(
        cfg_dir="F:\\kkk\\dataset\\benign\\refind_cfg",
        dot_dir="F:\\kkk\\dataset\\benign\\refind_dot",
        output_dir="F:\\kkk\\dataset\\benign\\refind_jsonl",
        log_path="D:\\hkn\\infected\\datasets\\logging\\convert_benign_log.log",
        # NOTE(review): the '{}' below is never formatted, so the log file name
        # contains literal braces; kept as is so existing logs are still found.
        process_log_path="D:\\hkn\\infected\\datasets\\logging\\convert_benign_process_log{}.log",
        overhaul=overhaul)
 if __name__ == '__main__':
    # convert(35, 69)
    convert_benign(False)
--- a/Genius3/raw-feature-extractor/discovRe.py
+++ b/Genius3/raw-feature-extractor/discovRe.py
@ -1,4 +1,3 @@
 # coding=utf-8
 #
 # Reference Lister
 #
@ -7,164 +6,130 @@
 # Implemented with the idautils module
 #
 import networkx as nx
 import cPickle as pickle
 import pdb
 from graph_analysis_ida import *
 from graph_property import *
-
+#import wingdbstub
-
+#wingdbstub.Ensure()
 # import wingdbstub
 # wingdbstub.Ensure()
 def get_funcs(ea):
-    funcs = {}
+        funcs = {}
-    # Get current ea
+        # Get current ea
-    # Loop from start to end in the current segment
+        # Loop from start to end in the current segment
 	for funcea in Functions(SegStart(ea)):
 		funcname = GetFunctionName(funcea)
 		func = get_func(funcea)
 		blocks = FlowChart(func)
 		funcs[funcname] = []
 		for bl in blocks:
 		        start = bl.startEA
 		        end = bl.endEA
 		        funcs[funcname].append((start, end))
        return funcs
 def get_funcs_for_discoverRe(ea):
    features = {}
    for funcea in Functions(SegStart(ea)):
        funcname = GetFunctionName(funcea)
        print funcname
        func = get_func(funcea)
-        blocks = FlowChart(func)
+        feature = get_discoverRe_feature(func)
-        funcs[funcname] = []
+        features[funcname] = feature
-        for bl in blocks:
+    return features
            start = bl.startEA
            end = bl.endEA
            funcs[funcname].append((start, end))
    return funcs
 # 似乎是没用的函数
 # def get_funcs_for_discoverRe(ea):
 #     features = {}
 #     for funcea in Functions(SegStart(ea)):
 #         funcname = GetFunctionName(funcea)
 #         print(funcname)
 #         func = get_func(funcea)
 #         feature = get_discoverRe_feature(func)
 #         features[funcname] = feature
 #     return features
 # Collect the attribute features of every basic block of a function.
 # Per-block order: calls / transfer / arithmetic / logic / compare / move /
 # termination (interrupt) / data declaration / total instructions /
 # string-or-integer constants.
 # The original note describes an 11-dimensional feature ending with the
 # number of offspring; this function itself emits the first ten values.
 def get_bb_features(func):
    """Return one ten-element feature list per basic block of ``func``.

    ``func`` is handed to ``FlowChart``; each resulting block is reduced to a
    ``(startEA, endEA)`` pair and that pair is passed to the counting helpers.
    The returned list follows the iteration order of ``FlowChart(func)``.
    """
    compare_ops = {'cmp': 1, 'cmps': 1, 'cmpsb': 1, 'cmppd': 1, 'cmpps': 1, 'fcom': 1, 'fcomp': 1, 'fcompp': 1, 'ficom': 1, 'ficomp': 1, 'ptest': 1, 'test': 1}
    move_ops = {'mov': 1, 'movb': 1, 'movw': 1, 'movl': 1, 'movq': 1, 'movabsq': 1, 'push': 1, 'pop': 1, 'lea': 1}
    interrupt_ops = {'int1': 1, 'int3': 1, 'into': 1, 'iret': 1, 'iretd': 1, 'iretq': 1}
    declare_ops = {'dw': 1, 'dd': 1, 'db': 1}

    def describe(span):
        # Helpers are invoked in the same order as the feature layout.
        call_count = calCalls(span)
        transfer_count = calTransferIns(span)
        arithmetic_count = calArithmeticIns(span)
        logic_count = calLogicInstructions(span)
        compare_count = calIns(span, compare_ops)
        move_count = calIns(span, move_ops)
        interrupt_count = calIns(span, interrupt_ops)
        declare_count = calIns(span, declare_ops)
        instruction_count = calInsts(span)
        found_consts = getBBconsts(span)
        # The first two entries of the constants result are summed by length.
        const_count = len(found_consts[0]) + len(found_consts[1])
        return [call_count, transfer_count, arithmetic_count, logic_count,
                compare_count, move_count, interrupt_count, declare_count,
                instruction_count, const_count]

    # Materialise every block range first, then compute the features.
    spans = [(node.startEA, node.endEA) for node in FlowChart(func)]
    return [describe(span) for span in spans]
 def get_discoverRe_feature(func, icfg):
    start = func.startEA
    end = func.endEA
    features = []
    FunctionCalls = getFuncCalls(func)
-    # 1
+    #1
    features.append(FunctionCalls)
    LogicInstr = getLogicInsts(func)
-    # 2
+    #2
    features.append(LogicInstr)
    Transfer = getTransferInsts(func)
-    # 3
+    #3
    features.append(Transfer)
    Locals = getLocalVariables(func)
-    # 4
+    #4
    features.append(Locals)
    BB = getBasicBlocks(func)
-    # 5
+    #5
    features.append(BB)
    Edges = len(icfg.edges())
-    # 6
+    #6
    features.append(Edges)
    Incoming = getIncommingCalls(func)
-    # 7
+    #7
    features.append(Incoming)
-    # 8
+    #8
    Instrs = getIntrs(func)
    features.append(Instrs)
    between = retrieveGP(icfg)
-    # 9
+    #9
    features.append(between)
    strings, consts = getfunc_consts(func)
    # 10
    features.append(strings)
    # 11
    features.append(consts)
    return features
 def get_func_names(ea):
    funcs = {}
    for funcea in Functions(SegStart(ea)):
-        funcname = GetFunctionName(funcea)
+            funcname = GetFunctionName(funcea)
-        funcs[funcname] = funcea
+            funcs[funcname] = funcea
    return funcs
 def get_func_bases(ea):
-    funcs = {}
+        funcs = {}
-    for funcea in Functions(SegStart(ea)):
+        for funcea in Functions(SegStart(ea)):
-        funcname = GetFunctionName(funcea)
+                funcname = GetFunctionName(funcea)
-        funcs[funcea] = funcname
+                funcs[funcea] = funcname
-    return funcs
+        return funcs
 def get_func_range(ea):
-    funcs = {}
+        funcs = {}
-    for funcea in Functions(SegStart(ea)):
+        for funcea in Functions(SegStart(ea)):
-        funcname = GetFunctionName(funcea)
+                funcname = GetFunctionName(funcea)
-        func = get_func(funcea)
+		func = get_func(funcea)
-        funcs[funcname] = (func.startEA, func.endEA)
+                funcs[funcname] = (func.startEA, func.endEA)
-    return funcs
+        return funcs
 def get_func_sequences(ea):
-    funcs_bodylist = {}
+	funcs_bodylist = {}
-    funcs = get_funcs(ea)
+	funcs = get_funcs(ea)
-    for funcname in funcs:
+	for funcname in funcs:
-        if funcname not in funcs_bodylist:
+		if funcname not in funcs_bodylist:
-            funcs_bodylist[funcname] = []
+			funcs_bodylist[funcname] = []
-        for start, end in funcs[funcname]:
+		for start, end in funcs[funcname]:
-            inst_addr = start
+			inst_addr = start
-            while inst_addr <= end:
+			while inst_addr <= end:
-                opcode = GetMnem(inst_addr)
+				opcode = GetMnem(inst_addr)
-                funcs_bodylist[funcname].append(opcode)
+				funcs_bodylist[funcname].append(opcode)
-                inst_addr = NextHead(inst_addr)
+				inst_addr = NextHead(inst_addr)
-    return funcs_bodylist
+        return funcs_bodylist
 def get_func_cfgs(ea):
    func_cfglist = {}
    i = 0
    start, end = get_section('LOAD')
-    # print start, end
+    #print start, end
    for funcea in Functions(SegStart(ea)):
        if start <= funcea <= end:
            funcname = GetFunctionName(funcea)
            func = get_func(funcea)
-            print(i)
+            print i
            i += 1
            try:
                icfg = cfg.cfg_construct(func)
                func_cfglist[funcname] = icfg
            except:
                pass
-
+            
    return func_cfglist
 def get_section(t):
    base = SegByName(t)
    start = SegByBase(base)
@ -179,7 +144,7 @@ def get_func_cfg_sequences(func_cfglist):
        cfg = func_cfglist[funcname][0]
        for start, end in cfg:
            codesq = get_sequences(start, end)
-            func_cfg_seqlist[funcname][(start, end)] = codesq
+            func_cfg_seqlist[funcname][(start,end)] = codesq
    return func_cfg_seqlist
@ -193,30 +158,28 @@ def get_sequences(start, end):
        inst_addr = NextHead(inst_addr)
    return seq
 def get_stack_arg(func_addr):
-    print(func_addr)
+    print func_addr
    args = []
    stack = GetFrame(func_addr)
    if not stack:
-        return []
+            return []
    firstM = GetFirstMember(stack)
    lastM = GetLastMember(stack)
    i = firstM
-    while i <= lastM:
+    while i <=lastM:
-        mName = GetMemberName(stack, i)
+        mName = GetMemberName(stack,i)
-        mSize = GetMemberSize(stack, i)
+        mSize = GetMemberSize(stack,i)
        if mSize:
-            i = i + mSize
+                i = i + mSize
        else:
-            i = i + 4
+                i = i+4
        if mName not in args and mName and ' s' not in mName and ' r' not in mName:
            args.append(mName)
    return args
-    # pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))
+        #pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))
-
+        
 def processDataSegs():
    funcdata = {}
    datafunc = {}
@ -232,7 +195,7 @@ def processDataSegs():
                refs = [v for v in DataRefsTo(cur)]
                for fea in refs:
                    name = GetFunctionName(fea)
-                    if len(name) == 0:
+                    if len(name)== 0:
                        continue
                    if name not in funcdata:
                        funcdata[name] = [cur]
@ -245,7 +208,6 @@ def processDataSegs():
                cur = NextHead(cur)
    return funcdata, datafunc
 def obtainDataRefs(callgraph):
    datarefs = {}
    funcdata, datafunc = processDataSegs()
@ -256,9 +218,11 @@ def obtainDataRefs(callgraph):
                refs = datafunc[dd]
                refs = list(set(refs))
                if node in datarefs:
-                    print(refs)
+                    print refs
                    datarefs[node] += refs
                    datarefs[node] = list(set(datarefs[node]))
                else:
                    datarefs[node] = refs
    return datarefs
--- a/Genius3/raw-feature-extractor/discovRe.pyc
+++ b/Genius3/raw-feature-extractor/discovRe.pyc
--- a/Genius3/raw-feature-extractor/func.py
+++ b/Genius3/raw-feature-extractor/func.py
@ -11,302 +11,283 @@ from idaapi import *
 from idc import *
 import networkx as nx
 import cfg_constructor as cfg
 import cPickle as pickle
 import pdb
 from raw_graphs import *
 #from discovRe_feature.discovRe import *
 from discovRe import *
 sys.path.append("D:\\hkn\\project_folder\\Gencoding3\\Genius3\\python")
 #import wingdbstub
 #wingdbstub.Ensure()
 def print_obj(obj):
-    # "打印对象的所有属性"
+    "打印对象的所有属性"
    print(obj.__dict__)
 def gt_funcNames(ea):
-    funcs = []
+	funcs = []
-    plt_func, plt_data = processpltSegs()
+	plt_func, plt_data = processpltSegs()
-    for funcea in Functions(SegStart(ea)):
+	for funcea in Functions(SegStart(ea)):
-            funcname = get_unified_funcname(funcea)
+			funcname = get_unified_funcname(funcea)
-            if funcname in plt_func:
+			if funcname in plt_func:
-                print(funcname)
+				print funcname
-                continue
+				continue
-            funcs.append(funcname)
+			funcs.append(funcname)
-    return funcs
+	return funcs
 def get_funcs(ea):
-    funcs = {}
+	funcs = {}
-        # Get current ea
+		# Get current ea
-        # Loop from start to end in the current segment
+		# Loop from start to end in the current segment
-    plt_func, plt_data = processpltSegs()
+	plt_func, plt_data = processpltSegs()
-    for funcea in Functions(SegStart(ea)):
+	for funcea in Functions(SegStart(ea)):
-        funcname = get_unified_funcname(funcea)
+		funcname = get_unified_funcname(funcea)
-        if funcname in plt_func:
+		if funcname in plt_func:
-            continue
+			continue
-        func = get_func(funcea)
+		func = get_func(funcea)
-        blocks = FlowChart(func)
+		blocks = FlowChart(func)
-        funcs[funcname] = []
+		funcs[funcname] = []
-        for bl in blocks:
+		for bl in blocks:
-                start = bl.startEA
+				start = bl.startEA
-                end = bl.endEA
+				end = bl.endEA
-                funcs[funcname].append((start, end))
+				funcs[funcname].append((start, end))
-    return funcs
+	return funcs
 # used for the callgraph generation.
 def get_func_namesWithoutE(ea):
-    funcs = {}
+	funcs = {}
-    plt_func, plt_data = processpltSegs()
+	plt_func, plt_data = processpltSegs()
-    for funcea in Functions(SegStart(ea)):
+	for funcea in Functions(SegStart(ea)):
-            funcname = get_unified_funcname(funcea)
+			funcname = get_unified_funcname(funcea)
-            if 'close' in funcname:
+			if 'close' in funcname:
-                print(funcea)
+				print funcea
-            if funcname in plt_func:
+			if funcname in plt_func:
-                print(funcname)
+				print funcname
-                continue
+				continue
-            funcs[funcname] = funcea
+			funcs[funcname] = funcea
-    return funcs
+	return funcs
 # used for the callgraph generation.
 def get_func_names(ea):
-    funcs = {}
+	funcs = {}
-    for funcea in Functions(SegStart(ea)):
+	for funcea in Functions(SegStart(ea)):
-            funcname = get_unified_funcname(funcea)
+			funcname = get_unified_funcname(funcea)
-            funcs[funcname] = funcea
+			funcs[funcname] = funcea
-    return funcs
+	return funcs
 def get_func_bases(ea):
-        funcs = {}
+		funcs = {}
-        plt_func, plt_data = processpltSegs()
+		plt_func, plt_data = processpltSegs()
-        for funcea in Functions(SegStart(ea)):
+		for funcea in Functions(SegStart(ea)):
-                funcname = get_unified_funcname(funcea)
+				funcname = get_unified_funcname(funcea)
-                if funcname in plt_func:
+				if funcname in plt_func:
-                    continue
+					continue
-                funcs[funcea] = funcname
+				funcs[funcea] = funcname
-        return funcs
+		return funcs
 def get_func_range(ea):
-        funcs = {}
+		funcs = {}
-        for funcea in Functions(SegStart(ea)):
+		for funcea in Functions(SegStart(ea)):
-                funcname = get_unified_funcname(funcea)
+				funcname = get_unified_funcname(funcea)
-        func = get_func(funcea)
+		func = get_func(funcea)
-        funcs[funcname] = (func.startEA, func.endEA)
+		funcs[funcname] = (func.startEA, func.endEA)
-        return funcs
+		return funcs
 def get_unified_funcname(ea):
-    funcname = GetFunctionName(ea)
+	funcname = GetFunctionName(ea)
-    if len(funcname) > 0:
+	if len(funcname) > 0:
-        if '.' == funcname[0]:
+		if '.' == funcname[0]:
-            funcname = funcname[1:]
+			funcname = funcname[1:]
-    return funcname
+	return funcname
 def get_func_sequences(ea):
-    funcs_bodylist = {}
+	funcs_bodylist = {}
-    funcs = get_funcs(ea)
+	funcs = get_funcs(ea)
-    for funcname in funcs:
+	for funcname in funcs:
-        if funcname not in funcs_bodylist:
+		if funcname not in funcs_bodylist:
-            funcs_bodylist[funcname] = []
+			funcs_bodylist[funcname] = []
-        for start, end in funcs[funcname]:
+		for start, end in funcs[funcname]:
-            inst_addr = start
+			inst_addr = start
-            while inst_addr <= end:
+			while inst_addr <= end:
-                opcode = GetMnem(inst_addr)
+				opcode = GetMnem(inst_addr)
-                funcs_bodylist[funcname].append(opcode)
+				funcs_bodylist[funcname].append(opcode)
-                inst_addr = NextHead(inst_addr)
+				inst_addr = NextHead(inst_addr)
-    return funcs_bodylist
+	return funcs_bodylist
 def get_func_cfgs_c(ea):
-    # type: (object) -> object
+	# type: (object) -> object
-    binary_name = idc.GetInputFile()
+	binary_name = idc.GetInputFile()
-    raw_cfgs = raw_graphs(binary_name)
+	raw_cfgs = raw_graphs(binary_name)
-    externs_eas, ea_externs = processpltSegs()
+	externs_eas, ea_externs = processpltSegs()
-    i = 0
+	i = 0
-    for funcea in Functions(SegStart(ea)):
+	for funcea in Functions(SegStart(ea)):
-        funcname = get_unified_funcname(funcea)
+		funcname = get_unified_funcname(funcea)
-        func = get_func(funcea)
+		func = get_func(funcea)
-        print(i)
+		print i
-        i += 1
+		i += 1
-        icfg = cfg.getCfg(func, externs_eas, ea_externs)
+		icfg = cfg.getCfg(func, externs_eas, ea_externs)
-        func_f = get_discoverRe_feature(func, icfg[0])
+		func_f = get_discoverRe_feature(func, icfg[0])
-        bb_f = get_bb_features(func)
+		raw_g = raw_graph(funcname, icfg, func_f) #生成一个rawcfg。raw_graph是一个python class，定义在 raw_graph.py.包含g（本文的ACFG）、olg_g（discovRe的acfg）、feature（函数级别的一些特征，以及betweenness）
-        raw_g = raw_graph(funcname, icfg, func_f, bb_f)
+		raw_cfgs.append(raw_g) # raw_graphs 是另一个python class，存储raw_graph的list。定义在 raw_graph.py
-        raw_cfgs.append(raw_g) # raw_graphs 是另一个python class，存储raw_graph的list。定义在 raw_graph.py
+		#print(raw_g.__dict__)
-        #print(raw_g.__dict__)
+		#print(raw_g) 由于raw_graph、raw_graphs都是class，直接print只会打印<raw_graphs.raw_graphs instance at 0x09888FD0>，不能打印对象的属性。	#https://blog.51cto.com/steed/2046408 print_obj、    print(obj.__dict__)
-        #print(raw_g) 由于raw_graph、raw_graphs都是class，直接print只会打印<raw_graphs.raw_graphs instance at 0x09888FD0>，不能打印对象的属性。	#https://blog.51cto.com/steed/2046408 print_obj、    print(obj.__dict__)
+	return raw_cfgs
    return raw_cfgs
 def get_func_cfgs_ctest(ea):
-    binary_name = idc.GetInputFile()
+	binary_name = idc.GetInputFile()
-    raw_cfgs = raw_graphs(binary_name)
+	raw_cfgs = raw_graphs(binary_name)
-    externs_eas, ea_externs = processpltSegs()
+	externs_eas, ea_externs = processpltSegs()
-    i = 0
+	i = 0
-    diffs = {}
+	diffs = {}
-    for funcea in Functions(SegStart(ea)):
+	for funcea in Functions(SegStart(ea)):
-        funcname = get_unified_funcname(funcea)
+		funcname = get_unified_funcname(funcea)
-        func = get_func(funcea)
+		func = get_func(funcea)
-        print(i)
+		print i
-        i += 1
+		i += 1
-        icfg, old_cfg = cfg.getCfg(func, externs_eas, ea_externs)
+		icfg, old_cfg = cfg.getCfg(func, externs_eas, ea_externs)
-        diffs[funcname] = (icfg, old_cfg)
+		diffs[funcname] = (icfg, old_cfg)
-        #raw_g = raw_graph(funcname, icfg)
+		#raw_g = raw_graph(funcname, icfg)
-        #raw_cfgs.append(raw_g)
+		#raw_cfgs.append(raw_g)
-
+			
-    return diffs
+	return diffs
 def get_func_cfgs(ea):
-    func_cfglist = {}
+	func_cfglist = {}
-    i = 0
+	i = 0
-    for funcea in Functions(SegStart(ea)):
+	for funcea in Functions(SegStart(ea)):
-        funcname = get_unified_funcname(funcea)
+		funcname = get_unified_funcname(funcea)
-        func = get_func(funcea)
+		func = get_func(funcea)
-        print(i)
+		print i
-        i += 1
+		i += 1
-        try:
+		try:
-            icfg = cfg.getCfg(func)
+			icfg = cfg.getCfg(func)
-            func_cfglist[funcname] = icfg
+			func_cfglist[funcname] = icfg
-        except:
+		except:
-            pass
+			pass
-
+			
-    return func_cfglist
+	return func_cfglist
 def get_func_cfg_sequences(func_cfglist):
-    func_cfg_seqlist = {}
+	func_cfg_seqlist = {}
-    for funcname in func_cfglist:
+	for funcname in func_cfglist:
-        func_cfg_seqlist[funcname] = {}
+		func_cfg_seqlist[funcname] = {}
-        cfg = func_cfglist[funcname][0]
+		cfg = func_cfglist[funcname][0]
-        for start, end in cfg:
+		for start, end in cfg:
-            codesq = get_sequences(start, end)
+			codesq = get_sequences(start, end)
-            func_cfg_seqlist[funcname][(start,end)] = codesq
+			func_cfg_seqlist[funcname][(start,end)] = codesq
-    return func_cfg_seqlist
+	return func_cfg_seqlist
 def get_sequences(start, end):
-    seq = []
+	seq = []
-    inst_addr = start
+	inst_addr = start
-    while inst_addr <= end:
+	while inst_addr <= end:
-        opcode = GetMnem(inst_addr)
+		opcode = GetMnem(inst_addr)
-        seq.append(opcode)
+		seq.append(opcode)
-        inst_addr = NextHead(inst_addr)
+		inst_addr = NextHead(inst_addr)
-    return seq
+	return seq
 def get_stack_arg(func_addr):
-    print(func_addr)
+	print func_addr
-    args = []
+	args = []
-    stack = GetFrame(func_addr)
+	stack = GetFrame(func_addr)
-    if not stack:
+	if not stack:
-            return []
+			return []
-    firstM = GetFirstMember(stack)
+	firstM = GetFirstMember(stack)
-    lastM = GetLastMember(stack)
+	lastM = GetLastMember(stack)
-    i = firstM
+	i = firstM
-    while i <=lastM:
+	while i <=lastM:
-        mName = GetMemberName(stack,i)
+		mName = GetMemberName(stack,i)
-        mSize = GetMemberSize(stack,i)
+		mSize = GetMemberSize(stack,i)
-        if mSize:
+		if mSize:
-                i = i + mSize
+				i = i + mSize
-        else:
+		else:
-                i = i+4
+				i = i+4
-        if mName not in args and mName and ' s' not in mName and ' r' not in mName:
+		if mName not in args and mName and ' s' not in mName and ' r' not in mName:
-            args.append(mName)
+			args.append(mName)
-    return args
+	return args
    #pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))
 		#pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))
 def processExternalSegs():
-    funcdata = {}
+	funcdata = {}
-    datafunc = {}
+	datafunc = {}
-    for n in xrange(idaapi.get_segm_qty()):
+	for n in xrange(idaapi.get_segm_qty()):
-        seg = idaapi.getnseg(n)
+		seg = idaapi.getnseg(n)
-        ea = seg.startEA
+		ea = seg.startEA
-        segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE)
+		segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE)
-        if segtype in [idc.SEG_XTRN]:
+		if segtype in [idc.SEG_XTRN]:
-            start = idc.SegStart(ea)
+			start = idc.SegStart(ea)
-            end = idc.SegEnd(ea)
+			end = idc.SegEnd(ea)
-            cur = start
+			cur = start
-            while cur <= end:
+			while cur <= end:
-                name = get_unified_funcname(cur)
+				name = get_unified_funcname(cur)
-                funcdata[name] = hex(cur)
+				funcdata[name] = hex(cur)
-                cur = NextHead(cur)
+				cur = NextHead(cur)
-    return funcdata
+	return funcdata
 def processpltSegs():
-    funcdata = {}
+	funcdata = {}
-    datafunc = {}
+	datafunc = {}
-    for n in xrange(idaapi.get_segm_qty()):
+	for n in xrange(idaapi.get_segm_qty()):
-        seg = idaapi.getnseg(n)
+		seg = idaapi.getnseg(n)
-        ea = seg.startEA
+		ea = seg.startEA
-        segname = SegName(ea)
+		segname = SegName(ea)
-        if segname in ['.plt', 'extern', '.MIPS.stubs']:
+		if segname in ['.plt', 'extern', '.MIPS.stubs']:
-            start = seg.startEA
+			start = seg.startEA
-            end = seg.endEA
+			end = seg.endEA
-            cur = start
+			cur = start
-            while cur < end:
+			while cur < end:
-                name = get_unified_funcname(cur)
+				name = get_unified_funcname(cur)
-                funcdata[name] = hex(cur)
+				funcdata[name] = hex(cur)
-                datafunc[cur]= name
+				datafunc[cur]= name
-                cur = NextHead(cur)
+				cur = NextHead(cur)
-    return funcdata, datafunc
+	return funcdata, datafunc
 def processDataSegs():
-    funcdata = {}
+	funcdata = {}
-    datafunc = {}
+	datafunc = {}
-    for n in xrange(idaapi.get_segm_qty()):
+	for n in xrange(idaapi.get_segm_qty()):
-        seg = idaapi.getnseg(n)
+		seg = idaapi.getnseg(n)
-        ea = seg.startEA
+		ea = seg.startEA
-        segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE)
+		segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE)
-        if segtype in [idc.SEG_DATA, idc.SEG_BSS]:
+		if segtype in [idc.SEG_DATA, idc.SEG_BSS]:
-            start = idc.SegStart(ea)
+			start = idc.SegStart(ea)
-            end = idc.SegEnd(ea)
+			end = idc.SegEnd(ea)
-            cur = start
+			cur = start
-            while cur <= end:
+			while cur <= end:
-                refs = [v for v in DataRefsTo(cur)]
+				refs = [v for v in DataRefsTo(cur)]
-                for fea in refs:
+				for fea in refs:
-                    name = get_unified_funcname(fea)
+					name = get_unified_funcname(fea)
-                    if len(name)== 0:
+					if len(name)== 0:
-                        continue
+						continue
-                    if name not in funcdata:
+					if name not in funcdata:
-                        funcdata[name] = [cur]
+						funcdata[name] = [cur]
-                    else:
+					else:
-                        funcdata[name].append(cur)
+						funcdata[name].append(cur)
-                    if cur not in datafunc:
+					if cur not in datafunc:
-                        datafunc[cur] = [name]
+						datafunc[cur] = [name]
-                    else:
+					else:
-                        datafunc[cur].append(name)
+						datafunc[cur].append(name)
-                cur = NextHead(cur)
+				cur = NextHead(cur)
-    return funcdata, datafunc
+	return funcdata, datafunc
 def obtainDataRefs(callgraph):
-    datarefs = {}
+	datarefs = {}
-    funcdata, datafunc = processDataSegs()
+	funcdata, datafunc = processDataSegs()
-    for node in callgraph:
+	for node in callgraph:
-        if node in funcdata:
+		if node in funcdata:
-            datas = funcdata[node]
+			datas = funcdata[node]
-            for dd in datas:
+			for dd in datas:
-                refs = datafunc[dd]
+				refs = datafunc[dd]
-                refs = list(set(refs))
+				refs = list(set(refs))
-                if node in datarefs:
+				if node in datarefs:
-                    print(refs)
+					print refs
-                    datarefs[node] += refs
+					datarefs[node] += refs
-                    datarefs[node] = list(set(datarefs[node]))
+					datarefs[node] = list(set(datarefs[node]))
-                else:
+				else:
-                    datarefs[node] = refs
+					datarefs[node] = refs
-    return datarefs
+	return datarefs
--- a/Genius3/raw-feature-extractor/func.pyc
+++ b/Genius3/raw-feature-extractor/func.pyc
--- a/Genius3/raw-feature-extractor/generate_asm_file.py
+++ b/Genius3/raw-feature-extractor/generate_asm_file.py
@ -1,24 +0,0 @@
 # coding=utf-8
 from func import *
 from idc import *
 def generate_asm_file():
    """Export the currently loaded input file as ``<input name>.asm`` and exit IDA.

    The AF_IMMOFF bit is cleared from the INF_START_AF analysis flags, the
    script waits for auto-analysis via ``idaapi.autoWait()``, writes the
    assembly listing and finally calls ``idc.Exit(0)``.
    """
    input_name = idc.GetInputFile()
    # Drop AF_IMMOFF from the start-up analysis flags before waiting.
    start_flags = idc.GetShortPrm(idc.INF_START_AF) & ~idc.AF_IMMOFF
    idc.SetShortPrm(idc.INF_START_AF, start_flags)
    idaapi.autoWait()
    # Generate the asm file for the PE file.
    asm_target = input_name + ".asm"
    idc.GenerateFile(idc.OFILE_ASM, asm_target, 0, idc.BADADDR, 0)
    # Command-line mode still opens IDA Pro, so close it automatically
    # at the end of every run.
    idc.Exit(0)
 if __name__ == '__main__':
    # Run the asm export when this file is executed as a script.
    generate_asm_file()
--- a/Genius3/raw-feature-extractor/graph_analysis_ida.py
+++ b/Genius3/raw-feature-extractor/graph_analysis_ida.py
@ -1,4 +1,3 @@
 # coding=utf-8
 from idautils import *
 from idaapi import *
 from idc import *
@ -139,7 +138,7 @@ def get_stackVariables(func_addr):
    return len(args)
-# 计算算数指令数量
+
 def calArithmeticIns(bl):
 	x86_AI = {'add':1, 'sub':1, 'div':1, 'imul':1, 'idiv':1, 'mul':1, 'shl':1, 'dec':1, 'inc':1}
 	mips_AI = {'add':1, 'addu':1, 'addi':1, 'addiu':1, 'mult':1, 'multu':1, 'div':1, 'divu':1}
@ -157,7 +156,6 @@ def calArithmeticIns(bl):
 		inst_addr = NextHead(inst_addr)
 	return invoke_num
 # 计算调用数量
 def calCalls(bl):
 	calls = {'call':1, 'jal':1, 'jalr':1}
 	start = bl[0]
@ -171,7 +169,6 @@ def calCalls(bl):
 		inst_addr = NextHead(inst_addr)
 	return invoke_num
 # 计算指令数量
 def calInsts(bl):
 	start = bl[0]
 	end = bl[1]
@ -199,23 +196,7 @@ def calLogicInstructions(bl):
 		inst_addr = NextHead(inst_addr)
 	return invoke_num
 def calIns(bl, inst):
 	"""Count instructions in a block whose mnemonic appears in ``inst``.

 	``bl`` is indexed as ``(start, end)``; addresses are walked with
 	``NextHead`` from ``start`` while they stay below ``end``.  ``inst`` is
 	copied into a private dict, so the caller's mapping is not modified.
 	"""
 	wanted = dict()
 	wanted.update(inst)
 	cursor = bl[0]
 	limit = bl[1]
 	hits = 0
 	while cursor < limit:
 		if GetMnem(cursor) in wanted:
 			hits += 1
 		cursor = NextHead(cursor)
 	return hits
 def calSconstants(bl):
 	calls = {}
 	start = bl[0]
 	end = bl[1]
 	invoke_num = 0
--- a/Genius3/raw-feature-extractor/graph_analysis_ida.pyc
+++ b/Genius3/raw-feature-extractor/graph_analysis_ida.pyc
--- a/Genius3/raw-feature-extractor/graph_property.pyc
+++ b/Genius3/raw-feature-extractor/graph_property.pyc
--- a/Genius3/raw-feature-extractor/ida_batch.py
+++ b/Genius3/raw-feature-extractor/ida_batch.py
@ -1,200 +0,0 @@
 # coding=utf-8
 import re
 import os
 import subprocess
 import multiprocessing
 from tqdm import tqdm
 import time
 # Timeout for processing a single PE file, in seconds.
 # Over repeated runs only a small number of files in a batch time out.
 # After all data has been processed, those files can be retried once with a
 # longer timeout and abandoned if they still time out.
 TIMEOUT = 60
 # Maximum number of samples to process per family.
 # NOTE(review): not referenced in the visible part of this file.
 MAX_FAMILY_PROCESS_NUM = 200
 def call_preprocess(cmd_line):
    """Run ``cmd_line`` through the shell; the exit status is discarded.

    Used as the target of the ``multiprocessing.Process`` workers in this
    file so the caller can poll the worker and enforce a timeout.
    """
    subprocess.call(cmd_line, shell=True)
 # Benign-software analysis mode: the IDA command passes -1 as the workflow.
 def benign_batch_mode(overhaul):
    """Run preprocessing_ida.py under idaq64 for every benign PE file.

    Files of the benign directory are processed in sorted order.  The index
    of the last successfully processed file is kept in a small log file so
    an interrupted run can resume; a second log receives one line per file.
    A run exceeding ``TIMEOUT`` seconds is ended with ``taskkill`` and
    counted as failed.  When finished, ``delete_output()`` is called and the
    total number of failures is printed.

    :param overhaul: when true, both log files are deleted first so the run
        starts from index 0.
    """
    # Total number of failed samples.
    total_failed = 0

    log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_log_benign.log'
    process_log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_process_log_benign.log'
    benign_pe_dir = 'F:\\kkk\\dataset\\benign\\refind'

    if overhaul:
        if os.path.exists(log_path):
            os.remove(log_path)
        if os.path.exists(process_log_path):
            os.remove(process_log_path)

    with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
        # A file opened with 'a+' may start with its position at end of file,
        # in which case readline() would always return '' and the saved
        # resume index would be ignored.  Rewind explicitly before reading.
        log.seek(0)
        logged = log.readline().strip()
        log_index = int(logged) if logged else 0

        pe_list = os.listdir(benign_pe_dir)
        for index, pe in enumerate(tqdm(sorted(pe_list))):
            if index < log_index:
                continue

            cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py -1" -oF:\iout {}'.format(
                os.path.join(benign_pe_dir, pe))

            p = multiprocessing.Process(target=call_preprocess, args=[cmd_line])
            p.start()
            # Poll once per second until the worker exits or TIMEOUT elapses.
            flag_kill = True
            started_at = time.time()
            while time.time() - started_at <= TIMEOUT:
                if not p.is_alive():
                    flag_kill = False
                    break
                time.sleep(1)

            if flag_kill:
                # Timed out: kill IDA and record the failure.  The resume
                # index is deliberately not advanced in this branch.
                subprocess.call('taskkill /im idaq64.exe /f')
                process_log.write(
                    "index {}, {} stuck, process terminated.\n".format(index, pe))
                total_failed += 1
            else:
                # Finished normally: overwrite the resume index.
                log.truncate(0)
                log.seek(0)
                log.write(str(index))
                log.flush()
                process_log.write("index {}, {} process done.\n".format(index, pe))

    # Remove all by-products.
    delete_output()
    print('总失败数{}'.format(total_failed))
 def mal_batch_mode(start, end, overhaul):
    """Run preprocessing_ida.py under idaq64 for the malware sample folders.

    For each workflow number in ``range(start, end)`` the matching sample
    directory, family list and log files are used.  Only samples whose family
    (looked up in the family file) is one of the selected families are
    processed.  A run exceeding ``TIMEOUT`` seconds is ended with
    ``taskkill`` and counted as failed.  ``delete_output()`` is called after
    every workflow, and the per-family counters are printed at the end.

    :param start: first workflow number (inclusive).
    :param end: last workflow number (exclusive).
    :param overhaul: when true, the log files of each workflow are deleted
        first so that workflow starts from index 0.
    """
    # Only PEs of these families are analysed, all others are skipped.
    # The values count the samples processed successfully per family.
    families_need_to_analyze = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0,
                                'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0}
    # Records which families the samples that failed in IDA come from.
    failed_family = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0,
                     'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0}
    # Total number of failed samples.
    total_failed = 0

    for workflow in range(start, end):
        # pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_test'
        pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
        family_path = 'D:\\hkn\\infected\\datasets\\virusshare_family\\virusshare_family{}.txt'.format(workflow)
        log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_log{}.log'.format(workflow)
        process_log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_process_log{}.log'.format(workflow)

        if overhaul:
            if os.path.exists(log_path):
                os.remove(log_path)
            if os.path.exists(process_log_path):
                os.remove(process_log_path)

        with open(log_path, 'a+') as log, \
                open(process_log_path, 'a+') as process_log, \
                open(family_path, 'r') as family_file:
            # A file opened with 'a+' may start with its position at end of
            # file, in which case readline() would always return '' and the
            # saved resume index would be ignored.  Rewind before reading.
            log.seek(0)
            logged = log.readline().strip()
            log_index = int(logged) if logged else 0

            families = family_file.read()
            for index, pe in enumerate(tqdm(sorted(os.listdir(pe_dir)))):
                if index < log_index:
                    continue

                # Match the file's md5 (the file name without its first 11
                # characters) and take that md5's family from the family
                # file.  The name is escaped so it is matched literally.
                regex = re.compile(re.escape(pe[11:]) + r'[\t][\S]*')
                search_result = regex.findall(families)
                if not search_result:
                    continue
                fields = search_result[0].split()
                if len(fields) < 2:
                    # Entry without a family name: nothing to select on.
                    continue
                pe_family = fields[1]
                if pe_family not in families_need_to_analyze:
                    continue

                cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format(
                    workflow, os.path.join(pe_dir, pe))

                p = multiprocessing.Process(target=call_preprocess, args=[cmd_line])
                p.start()
                # Poll once per second until the worker exits or TIMEOUT
                # elapses.  A separate name is used for the timestamp so the
                # ``start`` parameter is not shadowed.
                flag_kill = True
                started_at = time.time()
                while time.time() - started_at <= TIMEOUT:
                    if not p.is_alive():
                        flag_kill = False
                        break
                    time.sleep(1)

                if flag_kill:
                    # Timed out: kill IDA and record the failure.  The resume
                    # index is deliberately not advanced in this branch.
                    subprocess.call('taskkill /im idaq64.exe /f')
                    process_log.write(
                        "index {}, {} in workflow {} stuck, process terminated.\n".format(index, pe, workflow))
                    failed_family[pe_family] += 1
                    total_failed += 1
                else:
                    # Finished normally: overwrite the resume index.
                    log.truncate(0)
                    log.seek(0)
                    log.write(str(index))
                    log.flush()
                    process_log.write("index {}, {} process done.\n".format(index, pe))
                    families_need_to_analyze[pe_family] += 1

        # Remove all by-products once a workflow is finished.
        delete_output()

    print(families_need_to_analyze)
    print('\n')
    print(failed_family, '总失败数{}'.format(total_failed))
 def delete_output(out_dir='F:\\iout'):
    """Delete every regular file directly inside ``out_dir``.

    Sub-directories are left untouched, and a missing ``out_dir`` is treated
    as "nothing to clean" so that a cleanup step at the end of a long batch
    run cannot abort it.

    :param out_dir: directory holding IDA by-products; defaults to the
        output directory used by the batch commands in this file.
    """
    if not os.path.isdir(out_dir):
        return
    for entry in os.listdir(out_dir):
        target = os.path.join(out_dir, entry)
        # os.remove() raises on directories, so only plain files are removed.
        if os.path.isfile(target):
            os.remove(target)
 def generate_asm_batch_mode():
    """Run generate_asm_file.py under idaq64 for every file in the benign folder.

    Each file is handled by its own worker process; the loop waits, polling
    once per second, until that worker has exited before starting the next
    one.  ``delete_output()`` is called after the last file.
    """
    pe_dir = 'F:\\kkk\\dataset\\benign\\refind'
    command_template = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\generate_asm_file.py" -oF:\iout {}'
    for sample in tqdm(os.listdir(pe_dir)):
        command = command_template.format(os.path.join(pe_dir, sample))
        worker = multiprocessing.Process(target=call_preprocess, args=[command])
        worker.start()
        # No timeout here: wait for as long as the worker is alive.
        while worker.is_alive():
            time.sleep(1)
    delete_output()
 # Note: this .py file must be placed in the IDA root directory and must be
 # launched from cmd, otherwise the Python libraries cannot be linked.
 # F:\\kkk\\IDA_6.6
 if __name__ == '__main__':
    # Only the benign batch is enabled, with overhaul=True; the two calls
    # below are left disabled.
    benign_batch_mode(True)
    # mal_batch_mode(35, 69, True)
    # generate_asm_batch_mode()
--- a/Genius3/raw-feature-extractor/preprocessing_ida.py
+++ b/Genius3/raw-feature-extractor/preprocessing_ida.py
@ -1,54 +1,56 @@
 # -*- coding: UTF-8 -*-
-import pickle
+import sys
 from func import *
 from raw_graphs import *
 from idc import *
 import os
 import argparse
 import raw_graphs
 def print_obj(obj):
    """Print all attributes of ``obj`` (the contents of its ``__dict__``)."""
    attributes = obj.__dict__
    print(attributes)
-def preprocess():
+def parse_command():
-    # E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
+	parser = argparse.ArgumentParser(description='Process some integers.')
-    # print str(sys.argv) #['raw-feature-extractor/preprocessing_ida.py']
+	parser.add_argument("--path", type=str, help="The directory where to store the generated .ida file")
-    # print str(idc.ARGV) #['raw-feature-extractor/preprocessing_ida.py', '--path', 'C:\\Program1\\pycharmproject\\Genius3\\acfgs']
+	args = parser.parse_args()
-    # print idc.ARGV[2]
+	return args
    # print type(idc.ARGV[2])
    binary_name = idc.GetInputFile()
    workflow = idc.ARGV[1]
    # workflow为特定值时分析良性软件，否则分析恶意软件
    if workflow == '-1':
        cfg_path = "D:\\bishe\\dataset\\benign\\refind_cfg\\{}.ida".format(binary_name)
        gdl_path = "D:\\bishe\\dataset\\benign\\refind_dot\\{}.dot".format(binary_name)
        asm_path = "D:\\bishe\\dataset\\benign\\refind_asm\\{}.asm".format(binary_name)
    else:
        cfg_path = "D:\\bishe\\dataset\\infected\\infected_cfg\\{}.ida".format(binary_name)
        gdl_path = "D:\\bishe\\dataset\\infected\\infected_dot\\{}.dot".format(binary_name)
        asm_path = "D:\\bishe\\dataset\\infected\\infected_asm\\{}.asm".format(binary_name)
    analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
    analysis_flags &= ~idc.AF_IMMOFF
    idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
    idaapi.autoWait()
    # 生成pe文件的cfg列表
    cfgs = get_func_cfgs_c(FirstSeg())
    # 将cfg保存为.ida
    pickle.dump(cfgs, open(cfg_path, 'w'))
    # 生成pe文件的fcg，保存为.dot文件
    # idc.GenCallGdl(gdl_path, 'Call Gdl', idc.CHART_GEN_GDL) 这个生成gdl文件，网上几乎找不到gdl这个格式
    idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT)
    # 生成.asm文件
    idc.GenerateFile(idc.OFILE_ASM, asm_path, 0, idc.BADADDR, 0)
    # 关闭IDA Pro
    idc.Exit(0)
 # 通用命令行格式  idaq64 -c -A -S"preprocessing_ida.py arg1 arg2" VirusShare_bca58b12923073
 # 此处使用 idaq64 -c -A -S"preprocessing_ida.py workflow" -oF:\iout pe_path，完整命令行如下
 # F:\kkk\IDA_6.6\idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oF:\iout D:\hkn\infected\datasets\virusshare_infected0\VirusShare_bc161e5e792028e8137aa070fda53f82
 # D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out D:\bishe\dataset\train_malware\0ACDbR5M3ZhBJajygTuf
 if __name__ == '__main__':
-    preprocess()
+	#E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
 	#print str(sys.argv) #['raw-feature-extractor/preprocessing_ida.py']
 	#print str(idc.ARGV) #['raw-feature-extractor/preprocessing_ida.py', '--path', 'C:\\Program1\\pycharmproject\\Genius3\\acfgs']
 	#print idc.ARGV[2]
 	#print type(idc.ARGV[2])
 	# E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe  -c -A -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius4\acfgs" hpcenter
 	#测试生成原始特征的时间。
 	start_t = time.clock()
 	args = parse_command()
 	#path = args.path
 	path = idc.ARGV[2]
 	analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
 	analysis_flags &= ~idc.AF_IMMOFF
 	# turn off "automatically make offset" heuristic
 	idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
 	idaapi.autoWait()
 	cfgs = get_func_cfgs_c(FirstSeg())
 	end_t = time.clock()
 	print (end_t - start_t) #1.5934438s hpcenter 83.4 KB    #35.6745299s SCGDW698 5.5mb  #14.1480888s  762kb   SCMQTTIot     这个时间包括ida分析二进制文件的时间和脚本生成对应原始特征的时间
 	# 应该是随着函数和基本块的数量增加而线性增加的，先不写了。可能ida分析二进制文件的占比比较高
 	binary_name = idc.GetInputFile() + '.ida'
 	print path
 	print binary_name
 	fullpath = os.path.join(path, binary_name)
 	pickle.dump(cfgs, open(fullpath,'w'))
 	#print binary_name
 	#加上这句，脚本执行完就退出IDA
 	#idc.Exit(0)
--- a/Genius3/raw-feature-extractor/preprocessing_ida.pyc
+++ b/Genius3/raw-feature-extractor/preprocessing_ida.pyc
--- a/Genius3/raw-feature-extractor/raw_graphs.py
+++ b/Genius3/raw-feature-extractor/raw_graphs.py
@ -2,26 +2,24 @@
 import itertools
 import sys
-# sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
+sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
-# sys.path.insert(1, 'C:/Python27/Lib/site-packages')
+sys.path.insert(1, 'C:/Python27/Lib/site-packages')
 import networkx as nx
-import numpy as np
+#import numpy as np
 from subprocess import Popen, PIPE
 import pdb
 import os
-import re
+import re,mmap
-import mmap
+#from graph_edit_new import *
 # from graph_edit_new import *
 class raw_graph:
-	def __init__(self, funcname, g, func_f, bb_f):
+	def __init__(self, funcname, g, func_f):
 		#print "create"
 		self.funcname = funcname
 		self.old_g = g[0]
 		self.g = nx.DiGraph()
 		self.entry = g[1]
 		self.bb_features = bb_f  # len=bb数量,每个元素都是一个11维向量
 		self.fun_features = func_f
 		self.attributing()
@ -56,9 +54,6 @@ class raw_graph:
 				offsprings[suc] = 1
 				self.getOffsprings(g, suc, offsprings)
 	# 提取acfg的属性特征
 	# 调用/传输/算术/逻辑/比较/移动/终止
 	# 数据声明/总指令数/字符串或整数常量/后代的数量
 	def retrieveVec(self, id_, g):
 		feature_vec = []
 		#numC0
@ -101,7 +96,7 @@ class raw_graph:
 	def genMotifs(self, n):
 		motifs = {}
-		subgs = self.enumerating(n)
+		subgs = enumerating(n)
 		for subg in subgs:
 			if len(motifs) == 0:
 				motifs[subg] = [subg]
@ -187,7 +182,7 @@ class raw_graph:
 			tg.updateG(fang, indexes, self.g)
 			return tg
 		pdb.set_trace()
-		print("there is g which is none")
+		print "there is g which is none"
 	def createG(self, binary_str, n):
 		g = nx.DiGraph()
--- a/Genius3/raw-feature-extractor/raw_graphs.pyc
+++ b/Genius3/raw-feature-extractor/raw_graphs.pyc
--- a/Genius3/raw-feature-extractor/read_idaFILE.py
+++ b/Genius3/raw-feature-extractor/read_idaFILE.py
@ -1,71 +1,70 @@
 # -*- coding: UTF-8 -*-
 import sys
 import sys
 from matplotlib import pyplot as plt
 sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
 sys.path.insert(1, 'C:/Python27/Lib/site-packages')
 import networkx as nx
 import pickle
 # sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
 # sys.path.insert(1, 'C:/Python27/Lib/site-packages')
 def print_obj(obj):
-    # "打印对象的所有属性"
+    "打印对象的所有属性"
    print(obj.__dict__)
 import pickle
-# sub_10F20 308  反编译代码有字符串，但是这个特征提取里没有字符串 constant，可能是间接引用的，不识别。看了下所有函数的特征，几乎都没有字符串常量，可能都是写在别的地方然后引用的。
+#sub_10F20 308  反编译代码有字符串，但是这个特征提取里没有字符串 constant，可能是间接引用的，不识别。看了下所有函数的特征，几乎都没有字符串常量，可能都是写在别的地方然后引用的。
-# sub_166C4 393
+#sub_166C4 393
 if __name__ == '__main__':
-    testpath = "D:\\hkn\\infected\\datasets\\virusshare_infected23_cfg\\VirusShare_9ba64176b2ca61212ff56a5b4eb546ff.ida"
+
    testpath = "C:\Program1\pycharmproject\Genius3/acfgs/hpcenter.ida"
    fr = open(testpath, 'r')
-    data = pickle.load(fr) #一个二进制文件的acfgs
+    data1 = pickle.load(fr) #一个二进制文件的acfgs
-    fr.close()
+    #print(type(data1))
    #print_obj(data1)
    #print data1.raw_graph_list[393]
    #print_obj(data1.raw_graph_list[393])
    #nx.draw(data1.raw_graph_list[393].g,with_labels=True)
    #plt.show()
-    # print(type(data1))
+    print "一个二进制文件的所有函数的原始特征，list。"
-    # print_obj(data1)
+    print_obj(data1) #acfg list
-    # print data1.raw_graph_list[393]
+    print "\n"
    # print_obj(data1.raw_graph_list[393])
    # nx.draw(data1.raw_graph_list[393].g,with_labels=True)
    # plt.show()
-    print("一个二进制文件的所有函数的原始特征，list。")
+    print "一个函数的原始特征，由old_g（discovRe方法的ACFG），g（Genius方法的ACFG），fun_feature（表示函数级别的特征的向量）三部分构成"
-    print_obj(data)  # acfg list
+    print_obj(data1.raw_graph_list[393]) #一个函数的acfg
-    print("\n")
+    print "\n"
-
+    feature=data1.raw_graph_list[393].fun_features
-    print("一个函数的原始特征，由old_g（discovRe方法的ACFG），g（Genius方法的ACFG），fun_feature（表示函数级别的特征的向量）三部分构成")
+    print "函数级别特征： # 1 function calls # 2 logic instructions # 3 TransferIns # 4 LocalVariables # 5 BB basicblocks# 6 Edges # 7 IncommingCalls# 8 Intrs# 9 between # 10 strings # 11 consts"
-    print_obj(data.raw_graph_list[0])  # 一个函数的acfg
+    print feature
-    print("其中fun_features = 函数级别特征： # 1 function calls # 2 logic instructions # 3 TransferIns # 4 LocalVariables # 5 BB basicblocks# 6 Edges # 7 IncommingCalls# 8 Intrs# 9 between # 10 strings # 11 consts")
+    print "\n"
    # feature = data.raw_graph_list[0].fun_features
    print("old_g:{}".format(data.raw_graph_list[0].old_g))
    print("g:{}".format(data.raw_graph_list[0].g))
-    # G = data1.raw_graph_list[393].old_g
+    # G=data1.raw_graph_list[393].old_g
    # print G.node[0] # G.node[i]是dict
    # for key, value in G.node[0].items():
    #     print('{key}:{value}'.format(key=key, value=value))
-    # 基本块的特征 #1'consts' 数字常量 #2'strings'字符串常量 #3'offs' offspring 字节点数量？ #4'numAs' 算数指令如INC  #5'numCalls' 调用指令 #6'numIns' 指令数量 #7'numLIs' LogicInstructions 如AND #8'numTIs' 转移指令数量
+    # 一个基本块的特征 #1'consts' 数字常量 #2'strings'字符串常量 #3'offs' offspring 字节点数量？ #4'numAs' 算数指令如INC  #5'numCalls' 调用指令 #6'numIns' 指令数量 #7'numLIs' LogicInstructions 如AND #8'numTIs' 转移指令数量
-    G = data.raw_graph_list[0].g
+    G=data1.raw_graph_list[393].g
-    print("# 基本块的特征 #1'consts' 数字常量 #2'strings'字符串常量 #3'offs' offspring 后代数量 #4'numAs' 算数指令如INC  #5'numCalls' 调用指令 #6'numIns' 指令数量 #7'numLIs' LogicInstructions 逻辑如AND #8'numTIs' 转移指令数量")
+    print "# 一个基本块的特征 #1'consts' 数字常量 #2'strings'字符串常量 #3'offs' offspring 字节点数量？ #4'numAs' 算数指令如INC  #5'numCalls' 调用指令 #6'numIns' 指令数量 #7'numLIs' LogicInstructions 如AND #8'numTIs' 转移指令数量"
-    # print(G.node[0])
+    print G.node[0]
-    # print("\n")
+    print "\n"
-    # 函数内所有基本快的特征
+    # for key, value in G.node[0].items():
-    for key, value in G.node.items():
+    #     print('{key}:{value}'.format(key=key, value=value))
        print('{}:{}'.format(key, value))
    #oldg就是读取IDA的CFG，所以数量、方向等都一样；g根据old_g生成，也一样
    #old g
-    G = data.raw_graph_list[0].old_g
+    G = data1.raw_graph_list[393].old_g
-    nx.draw(G, with_labels=True)
+    nx.draw(G,with_labels=True)
    #plt.title('old_g')
    plt.show()
    # g
-    G = data.raw_graph_list[0].g
+    G = data1.raw_graph_list[393].g
-    nx.draw(G, with_labels=True)
+    nx.draw(G,with_labels=True)
    #plt.title('Genius_g')
    plt.show()
--- a/Genius3/raw-feature-extractor/test.py
+++ b/Genius3/raw-feature-extractor/test.py
@ -1,380 +1,8 @@
 # coding=utf-8
 import re
 import os
 import subprocess
 import time
 import json
 import random
 import shutil
 from tqdm import tqdm
 import csv
 import pandas as pd
-
+import pickle
-def create_dir():
+testpath = "C:\Program1\pycharmproject\Genius3/acfgs/hpcenter.ida"
-    parent_dir = "D:\\hkn\\infected\\datasets"
+fr = open(testpath, 'r')
-    for workflow in range(40, 70):
+data1 = pickle.load(fr)
-        # 生成raw data文件夹
+print(type(data1))
-        infected = "virusshare_infected{}".format(workflow)
+# # print_obj(data1)
-        cfg = "virusshare_infected{}_cfg".format(workflow)
+# print cfgs.raw_graph_list[0]
        dot = "virusshare_infected{}_dot".format(workflow)
        jsonl = "virusshare_infected{}_json".format(workflow)
        create(parent_dir, infected)
        create(parent_dir, cfg)
        create(parent_dir, dot)
        create(parent_dir, jsonl)
        # iout = "virusshare_infected{}_iout".format(workflow)
        # os.rmdir(os.path.join(parent_dir, iout))
        # os.rmdir(os.path.join(parent_dir, ida))
 def create(parent_dir, folder):
    """Create ``folder`` inside ``parent_dir`` unless that path already exists."""
    target = os.path.join(parent_dir, folder)
    if os.path.exists(target):
        return
    os.mkdir(target)
 def change_max_item_lines(cfg_path="F:\\kkk\\IDA_6.6\\cfg\\ida.cfg"):
    """Raise the MAX_ITEM_LINES setting of an IDA configuration file from 5000 to 50000.
    Args:
        cfg_path: Configuration file to patch in place. Defaults to the
            location that was previously hard-coded in this function.
    The file is read and written as bytes and is always rewritten, even when
    nothing matched. Running the function again on a patched file leaves it
    unchanged.
    """
    with open(cfg_path, 'rb') as f:
        content = f.read()
    # The negative look-ahead stops an already patched "50000" from matching
    # again and growing into "500000" on a second run.
    patched = re.sub(br'MAX_ITEM_LINES          = 5000(?![0-9])',
                     b'MAX_ITEM_LINES          = 50000', content)
    with open(cfg_path, 'wb') as f:
        f.write(patched)
 def clock():
    """Wait up to TIMEOUT seconds for process ``p`` to exit, then force-kill idaq64.exe.
    NOTE(review): ``p`` is not defined in this function; it has to exist as a
    module-level name when this is called -- confirm against the caller.
    """
    TIMEOUT = 10
    started = time.time()
    while time.time() - started <= TIMEOUT:
        if not p.is_alive():
            # Finished within the time budget: nothing to kill.
            return
        time.sleep(1)  # Just to avoid hogging the CPU
    subprocess.call('taskkill /im idaq64.exe /f')
 def delete_error():
    """Delete the .jsonl files that the conversion logs of workflows 0-34 mark with 'Error occurred'.
    The file name is taken from each matching log line: the text that starts
    two characters after the first comma and ends before the first dot.
    """
    for workflow in range(0, 35):
        convert_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)
        json_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow)
        with open(convert_log_path, 'r') as log:
            for entry in log:
                if 'Error occurred' not in entry:
                    continue
                stem = entry[entry.find(',') + 2: entry.find('.')]
                victim = os.path.join(json_dir, stem + '.jsonl')
                if os.path.exists(victim):
                    os.remove(victim)
 def check_json():
    """Report every JSON file of workflows 0-68 whose first ``function_edges`` entry is empty.
    Files that raise ``UnicodeDecodeError`` while loading are skipped.
    """
    print('start checking json')
    for workflow in tqdm(range(0, 69)):
        json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
        for json_file in os.listdir(json_dir):
            with open(os.path.join(json_dir, json_file), 'r') as handle:
                try:
                    data = json.load(handle)
                except UnicodeDecodeError:
                    continue
            if len(data['function_edges'][0]) == 0:
                print("{} {} function_edges null\n".format(workflow, json_file))
 # Temporary helper: delete all jsonl files
 def delete_jsonl():
    """Remove every entry of the ``virusshare_infected<N>_json`` directories for N in 0-34."""
    for workflow in range(0, 35):
        json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
        entries = os.listdir(json_dir)
        for entry in entries:
            os.remove(os.path.join(json_dir, entry))
 def delete_all_local():
    """Remove every entry of the five ``virusshare_train`` class directories (1 to 5)."""
    data_dirs = [
        'D:\\hkn\\infected\\datasets\\virusshare_train\\1',
        'D:\\hkn\\infected\\datasets\\virusshare_train\\2',
        'D:\\hkn\\infected\\datasets\\virusshare_train\\3',
        'D:\\hkn\\infected\\datasets\\virusshare_train\\4',
        'D:\\hkn\\infected\\datasets\\virusshare_train\\5',
    ]
    for folder in data_dirs:
        for entry in os.listdir(folder):
            os.remove(os.path.join(folder, entry))
 # Rename the pt files so that their names match what the code expects
 def rename(mal_or_be, postfix):
    """Rename the files of the train/test/valid directories to ``<mal_or_be>_<index>.pt``.
    Args:
        mal_or_be: Name part used both in the directory name and in the new file names.
        postfix: Suffix appended to the directory name.
    Two passes are made over all three directories: the first prefixes every
    file name with 'm' (presumably so the final names cannot collide with
    existing ones -- confirm), the second assigns the indexed names.
    """
    tag_set = ['train', 'test', 'valid']
    for tag in tag_set:
        data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_{}{}/'.format(tag, mal_or_be, postfix)
        for old_name in os.listdir(data_dir):
            os.rename(os.path.join(data_dir, old_name), os.path.join(data_dir, 'm' + old_name))
    for tag in tag_set:
        data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_{}{}/'.format(tag, mal_or_be, postfix)
        position = 0
        for old_name in os.listdir(data_dir):
            new_name = '{}_{}.pt'.format(mal_or_be, position)
            os.rename(os.path.join(data_dir, old_name), os.path.join(data_dir, new_name))
            position += 1
 def split_data_by_label():
    """Move each ``<name>.pt`` file into the sub-directory named after its class label.
    The labels come from the label CSV; its first line is skipped and every
    other line is split into a name and a class. Files that are missing are reported.
    """
    source_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\all_pt'
    dest = 'D:\\hkn\\infected\\datasets\\virusshare_train'
    csv_path = 'F:\\kkk\\dataset\\virusshare_AllLabel.csv'
    with open(csv_path, 'r') as label:
        label.readline()
        rows = label.readlines()
        for row in rows:
            name, cls = row.strip().split(',')
            fpath = os.path.join(source_dir, name + '.pt')
            if not os.path.exists(fpath):
                print(fpath, 'file not exist.')
                continue
            shutil.move(fpath, os.path.join(dest, cls))
 def half_divide():
    """Copy the entries whose name contains 'pt' alternately into the test and valid directories."""
    src = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
    test = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\test_malware'
    valid = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\valid_malware'
    copied = 0
    for name in os.listdir(src):
        if 'pt' not in name:
            continue
        # Even-numbered matches go to test, odd-numbered ones to valid.
        target = test if copied % 2 == 0 else valid
        shutil.copy(os.path.join(src, name), target)
        copied += 1
 def copy_train_data():
    """Copy into train_malware every file of 'all' that is in neither test_malware nor valid_malware."""
    source_dir = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all'
    dest = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\train_malware'
    candidates = set(os.listdir(source_dir))
    held_out_test = set(os.listdir('D:\\hkn\\infected\\datasets\\proprecessed_pt\\test_malware'))
    held_out_valid = set(os.listdir('D:\\hkn\\infected\\datasets\\proprecessed_pt\\valid_malware'))
    train = candidates - held_out_test - held_out_valid
    for name in train:
        shutil.copy(os.path.join(source_dir, name), dest)
 def clear_dot():
    """Delete, for workflows 0-34, every dot file containing neither 'start' nor 'sub_'."""
    for workflow in range(0, 35):
        path = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot\\'.format(workflow)
        for name in os.listdir(path):
            full = os.path.join(path, name)
            with open(full, 'r') as handle:
                text = handle.read()
            has_marker = 'start' in text or 'sub_' in text
            if not has_marker:
                os.remove(full)
 def read_test():
    """Print the edges and labels found in one sample dot file.
    For a line containing '->' the standalone integers are printed; otherwise,
    for a line containing 'label', the text between '= "' and '",' is printed.
    """
    dot_file_path = "D:\\hkn\\infected\\datasets\\virusshare_infected23_dot\\VirusShare_9ba64176b2ca61212ff56a5b4eb546ff.dot"
    with open(dot_file_path, 'r') as dot:
        for line in dot:
            if '->' in line:
                print(re.findall(r'\b\d+\b', line))
                continue
            if 'label' in line:
                begin = line.find('= "') + 3
                finish = line.find('",')
                print(line[begin:finish])
 # Temporary tool: some PE files were never classified through the API; delete them outright
 def del_redundant():
    """Delete, for workflows 0-67, every PE file whose name (minus its first 11 characters) is absent from the family file."""
    for workflow in range(0, 68):
        pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
        family_file_path = 'D:\\hkn\\infected\\datasets\\virusshare_family\\virusshare_family{}.txt'.format(workflow)
        with open(family_file_path, 'r') as f_file:
            family = f_file.read()
            for name in os.listdir(pe_dir):
                # Substring test against the whole text of the family file.
                if name[11:] not in family:
                    os.remove(os.path.join(pe_dir, name))
 def delete_pe():
    """Print the expected .dot path for every benign cfg file that has no matching dot file.
    Despite its name the function only prints; the removal call is commented out.
    """
    dot_dir = 'D:\\hkn\\infected\\datasets\\benign_dot'
    cfg_dir = 'D:\\hkn\\infected\\datasets\\benign_cfg'
    dot_list = os.listdir(dot_dir)
    for cfg in os.listdir(cfg_dir):
        name = cfg[:-4] + ".dot"
        if name not in dot_list:
            print(os.path.join(dot_dir, name))
            # os.remove(os.path.join(dot_dir, cfg))
 def delete_error_benign():
    """Delete every benign PE file without a .jsonl result, together with its .asm, .ida and .dot by-products."""
    jsonl_dir = 'F:\\kkk\\dataset\\benign\\refind_jsonl'
    dot_dir = 'F:\\kkk\\dataset\\benign\\refind_dot'
    cfg_dir = "F:\\kkk\\dataset\\benign\\refind_cfg"
    asm_dir = "F:\\kkk\\dataset\\benign\\refind_asm"
    pe_dir = "F:\\kkk\\dataset\\benign\\refind"
    # By-product directories with their file suffix, in removal order.
    by_products = [(asm_dir, '.asm'), (cfg_dir, '.ida'), (dot_dir, '.dot')]
    for pe_name in os.listdir(pe_dir):
        if os.path.exists(os.path.join(jsonl_dir, pe_name + '.jsonl')):
            continue
        os.remove(os.path.join(pe_dir, pe_name))
        for folder, suffix in by_products:
            leftover = os.path.join(folder, pe_name + suffix)
            if os.path.exists(leftover):
                os.remove(leftover)
 def generate_benign_csv():
    """Write an Id/Class CSV that lists every benign PE file with class '5'."""
    benign_pe_dir = 'F:\\kkk\\dataset\\benign\\refind'
    csv_out = 'F:\\kkk\\dataset\\benign_family.csv'
    fieldnames = ['Id', 'Class']
    id_key, class_key = fieldnames[0], fieldnames[1]
    with open(csv_out, "wb") as output_file:
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        for pe_name in os.listdir(benign_pe_dir):
            writer.writerow({id_key: pe_name, class_key: '5'})
 def process_csv(csv_path='F:\\kkk\\dataset\\virusshare_AllLabel.csv',
                pe_dir='D:\\hkn\\infected\\datasets\\virusshare_train\\pe'):
    """Filter the label CSV down to the samples present in ``pe_dir`` and rewrite it in place.
    Args:
        csv_path: CSV with an ``Id`` column; it is read and then overwritten.
        pe_dir: Directory whose entry names are matched against ``Id``.
    Rows whose ``Id`` is not a file name in ``pe_dir`` are dropped, duplicate
    ids are removed, and every remaining id gets the ``VirusShare_`` prefix.
    Both defaults are the paths previously hard-coded in this function.
    """
    files = os.listdir(pe_dir)
    print(len(files))
    # The table has to be loaded first: it was previously filtered before it
    # had ever been assigned, so the function always raised.
    df = pd.read_csv(csv_path)
    df = df[df['Id'].isin(files)]
    df = df.drop_duplicates('Id')
    df['Id'] = 'VirusShare_' + df['Id']
    df.to_csv(csv_path, index=False)
 def generate_virusshare_csv():
    """Write an Id/Class CSV for the samples of the four indexed families that exist in the PE directory.
    Every line of every family file is split on a tab into an md5 and a label;
    only labels found in ``index`` are kept.
    """
    index = {'wacatac': 1, 'ulpm': 2, 'fugrafa': 3, 'redcap': 4}
    fieldnames = ['Id', 'Class']
    pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\pe'
    family_dir = 'D:\\hkn\\infected\\datasets\\virusshare_family'
    csv_out = 'D:\\hkn\\infected\\datasets\\virusshare_family.csv'
    with open(csv_out, "wb") as output_file:
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        for family_name in tqdm(os.listdir(family_dir)):
            with open(os.path.join(family_dir, family_name), 'r') as family:
                for line in family.readlines():
                    md5, label = line.strip().split('\t')
                    if label not in index:
                        continue
                    sample = 'VirusShare_' + md5
                    if os.path.exists(os.path.join(pe_dir, sample)):
                        writer.writerow({fieldnames[0]: sample, fieldnames[1]: index[label]})
 def findlostone():
    """Print the name of every PE file that has no ``<name>.asm`` file in the asm directory."""
    pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\pe'
    asm_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\asm'
    for pe_name in os.listdir(pe_dir):
        expected = os.path.join(asm_dir, pe_name + '.asm')
        if os.path.exists(expected):
            continue
        print(pe_name)
 def find_pe_in_original_set():
    """Print the first workflow index (0-68) whose json directory contains the searched sample, then stop."""
    wanted = 'VirusShare_0f07b29873cf503a0fb69fa064ce76a3'
    for workflow in range(0, 69):
        data_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
        names = os.listdir(data_dir)
        # The last six characters of each name are cut off before comparing.
        if any(name[:-6] == wanted for name in names):
            print(workflow)
            return
 def select_jsonl():
    """Copy into malware_jsonl every workflow json file whose stem appears as a field of some CSV row."""
    csv_paths = 'F:\\kkk\\dataset\\virusshare_family.csv'
    jsonl_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\malware_jsonl'
    with open(csv_paths, 'r') as csv_path:
        rows = list(csv.reader(csv_path, delimiter=','))
        for workflow in range(0, 69):
            data_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
            for json_name in os.listdir(data_dir):
                stem = json_name[:-6]
                # A file is copied at most once, on the first row that lists it.
                if any(stem in row for row in rows):
                    shutil.copy(os.path.join(data_dir, json_name), jsonl_dir)
 def generate_csv():
    """Write an Id/Class CSV that lists every file of the class-5 PE directory with class 5."""
    pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\5\\pe'
    csv_path = 'D:\\hkn\\infected\\datasets\\virusshare_train\\5\\virusshare_5.csv'
    fieldnames = ['Id', 'Class']
    id_key, class_key = fieldnames[0], fieldnames[1]
    with open(csv_path, "wb") as output_file:
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        for pe_name in os.listdir(pe_dir):
            writer.writerow({id_key: pe_name, class_key: 5})
 def merge_csvs(cs, out):
    """Merge several CSV files on their ``Id`` column and write the result to ``out``.
    Args:
        cs: Iterable of CSV paths. The first file seeds the result; each later
            file is merged as the left operand of ``pd.merge(..., on='Id')``.
        out: Path of the CSV to write (without the index column).
    Raises:
        ValueError: If ``cs`` yields no path; this used to fail with an
            unbound-variable error instead.
    """
    merged = None
    for c in cs:
        frame = pd.read_csv(c)
        merged = frame if merged is None else pd.merge(frame, merged, on='Id')
        # merged = pd.concat([merged, pd.read_csv(c)])
    if merged is None:
        raise ValueError('merge_csvs needs at least one input CSV')
    # if 'Class' in merged:
    #     merged['Class'] = merged['Class'] - 1
    merged.to_csv(out, index=False)
 if __name__ == '__main__':
    # Script entry point: exactly one maintenance helper is enabled at a time;
    # the others are kept commented out for manual switching.
    # find_pe_in_original_set()
    # split_data_by_label()
    # select_jsonl()
    # findlostone()
    # generate_csv()
    # generate_virusshare_csv()
    # merge_csvs([
    #     'D:\\hkn\\infected\\datasets\\virusshare_train\\virusshare_1_compliment.csv',
    #     'D:\\hkn\\infected\\datasets\\virusshare_family.csv',
    #     'D:\\hkn\\infected\\datasets\\virusshare_train\\virusshare_5.csv',
    # ],
    #     'D:\\hkn\\infected\\datasets\\virusshare_family.csv'
    # )
    process_csv()
    # generate_benign_csv()
    # create_pixel_intensity()
    # create_dir()
    # change_max_item_lines()
    # subprocess.call('taskkill /im idaq64.exe /f')
    # delete_error_benign()
    # test()
    # delete_jsonl()
    # delete_all_local()
    # check_json()
    # delete_pe()
    # rename('malware', '_backup')
    # Specify 'standard' or 'benign' or 'one_family'
    # 'standard' processes all malware samples
    # split_samples()
    # 'one_family' processes a single family only; used just to test the original model's binary classification
    # split_samples('one_family')
    # 'benign' processes the benign samples
    # split_samples('benign')
    # half_divide()
    # copy_train_data()
    # clear_dot()
    # read_test()
    # del_redundant()
--- a/ida_file_cerate.bat
+++ b/ida_file_cerate.bat
@ -1,16 +0,0 @@
@echo off
 rem Runs preprocessing_ida.py through idaq64.exe once for every file in FOLDER_PATH.
 rem The script argument -1 is passed for this (benign) folder.
 setlocal EnableDelayedExpansion
 set "FOLDER_PATH=D:\bishe\dataset\train_benign"
 rem Each iteration echoes the current time and the file before launching IDA on it.
 for %%f in ("%FOLDER_PATH%\*") do (
    echo !time! %%f
    D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py -1" -oD:\bishe\dataset\out %%f
 )
 endlocal
--- a/ida_file_cerate_malware.bat
+++ b/ida_file_cerate_malware.bat
@ -1,16 +0,0 @@
@echo off
 rem Runs preprocessing_ida.py through idaq64.exe once for every file in FOLDER_PATH.
 rem The script argument 0 is passed for this (malware) folder.
 setlocal EnableDelayedExpansion
 set "FOLDER_PATH=D:\bishe\dataset\train_malware"
 rem Each iteration echoes the current time and the file before launching IDA on it.
 for %%f in ("%FOLDER_PATH%\*") do (
    echo !time! %%f
    D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out %%f
 )
 endlocal
--- a/raw-feature-extractor/cfg_constructor.py
+++ b/raw-feature-extractor/cfg_constructor.py
@ -0,0 +1,286 @@
 import copy
 import networkx as nx
 from idautils import *
 from idaapi import *
 from idc import *
 import copy
 import networkx as nx
 from idautils import *
 from idaapi import *
 from idc import *
 from graph_analysis_ida import *
 def getCfg(func, externs_eas, ea_externs):
 	"""Build the attributed control-flow graph of ``func`` using integer node ids.
 	Every distinct (start, end) block returned by ``obtain_block_sequence``
 	becomes one node whose 'label' is that tuple; the blocks touching the
 	function start/end are tagged with c="start"/"end". Edges come from the
 	code references to each block start, queried with flow flag 0 and then 1.
 	The node attributes are filled in by ``attributingRe``.
 	Returns the tuple ``(cfg, 0)``. The merge/filter pass (``transform``) is
 	not applied here.
 	"""
 	func_start = func.startEA
 	func_end = func.endEA
 	cfg = nx.DiGraph()
 	control_blocks, main_blocks = obtain_block_sequence(func)
 	visited = {}
 	for key in control_blocks:
 		block = control_blocks[key]
 		start = block[0]
 		end = block[1]
 		src_node = (start, end)
 		if src_node in visited:
 			src_id = visited[src_node]
 		else:
 			src_id = len(cfg)
 			visited[src_node] = src_id
 			cfg.add_node(src_id)
 			cfg.node[src_id]['label'] = src_node
 		if start == func_start:
 			cfg.node[src_id]['c'] = "start"
 		if end == func_end:
 			cfg.node[src_id]['c'] = "end"
 		# Same handling for both flow flags, flag 0 first.
 		for flow in (0, 1):
 			for ref in CodeRefsTo(start, flow):
 				if ref not in control_blocks:
 					continue
 				dst_node = control_blocks[ref]
 				if dst_node not in visited:
 					visited[dst_node] = len(cfg)
 				dst_id = visited[dst_node]
 				# The referencing block is the edge source.
 				cfg.add_edge(dst_id, src_id)
 				cfg.node[dst_id]['label'] = dst_node
 	attributingRe(cfg, externs_eas, ea_externs)
 	return cfg, 0
 def transform(cfg):
 	"""Simplify ``cfg`` in place: run ``merging`` first, then ``filtering``."""
 	merging(cfg)
 	filtering(cfg)
 def merging(cfg):
 	"""Merge, in place, every node with its single successor when that successor has no other predecessor.
 	NOTE(review): ``len(succs)`` and ``succs[0]`` assume ``successors`` and
 	``predecessors`` return lists -- confirm the networkx version in use.
 	"""
 	bb_ids = cfg.nodes()
 	for bb_id in bb_ids:
 		try:
 			# Nodes can be removed by an earlier merge in this same loop; any
 			# failure while looking one up is swallowed by the except below.
 			bb = cfg.node[bb_id]['label']
 			bb_start = bb[0]
 			bb_end = bb[1]
 			succs = cfg.successors(bb_id)
 			#preds = cfg.predecessors(bb_id)
 			if len(succs) == 1:
 				preds = cfg.predecessors(succs[0])
 				if len(preds) == 1:
 					domerge(cfg, bb_id, succs[0])
 		except:
 			# Best effort: a node that cannot be processed is skipped silently.
 			pass
 def domerge(cfg, bb_id, suc_node):
 	"""Fold ``suc_node`` into ``bb_id``: re-attach its successors to ``bb_id``, then remove it from ``cfg``."""
 	inherited = cfg.successors(suc_node)
 	for target in inherited:
 		cfg.add_edge(bb_id, target)
 	cfg.remove_node(suc_node)
 def filtering(cfg):
 	rm_sets = []
 	for bb_id in cfg:
 		bb = cfg.node[bb_id]['label']
 		bb_start = bb[0]
 		bb_end = bb[1]
 		re = remove(bb_start, bb_end)
 		print bb_id, re, bb_start, bb_end
 		if re:
 			print re, bb_id
 			rm_sets.append(bb_id)
 	print rm_sets
 	for bb_id in rm_sets:
 		cfg.remove_node(bb_id)
 def remove(bb_start, bb_end):
 	"""Return True when the sequences of the block [bb_start, bb_end) match a pattern known to ``matchseq``."""
 	matched = matchseq(getSequences(bb_start, bb_end))
 	return True if matched else False
 def matchseq(seqs):
 	"""Return True when ``seqs`` matches one of the hard-coded MIPS/x86 patterns.
 	A match is either every element being drawn from one of the two mnemonic
 	sets (an empty ``seqs`` therefore matches), or the set of elements being
 	exactly equal to one of the four fixed patterns.
 	"""
 	mips = set(['lw', "jr", "addiu"])
 	x86 = set(['add', 'pop', 'retn'])
 	exact_patterns = (
 		set(['b', ('move','$v0')]),
 		set(['b', ('mov','$eax')]),
 		set([('move','$v0')]),
 		set([('mov','$eax')]),
 	)
 	observed = set(seqs)
 	if observed.issubset(mips) or observed.issubset(x86):
 		return True
 	return any(observed == pattern for pattern in exact_patterns)
 def attributingRe(cfg, externs_eas, ea_externs):
 	"""Attach the per-block features to every node of ``cfg``, computed from the node's 'label' block.
 	Attributes written: numIns, numCalls, numLIs, numAs, numNc, consts,
 	strings, externs and numTIs. ``externs_eas`` is accepted but not used here.
 	"""
 	for node_id in cfg:
 		attrs = cfg.node[node_id]
 		bl = attrs['label']
 		attrs['numIns'] = calInsts(bl)
 		attrs['numCalls'] = calCalls(bl)
 		attrs['numLIs'] = calLogicInstructions(bl)
 		attrs['numAs'] = calArithmeticIns(bl)
 		strings, consts = getBBconsts(bl)
 		# numNc counts the strings and constants together.
 		attrs['numNc'] = len(strings) + len(consts)
 		attrs['consts'] = consts
 		attrs['strings'] = strings
 		attrs['externs'] = retrieveExterns(bl, ea_externs)
 		attrs['numTIs'] = calTransferIns(bl)
 def attributing(cfg):
 	ga = graph_analysis()
 	ga.gwithoffspring(cfg)
 	print "finishing offspring"
 	for node in cfg:
 		stmt_num = getStmtNum(node)
 		binary_value = getBinaryValue(node)
 		cfg.node[node]['stmt_num'] = stmt_num
 		cfg.node[node]['binary_value'] = binary_value
 	ga.domChecking(cfg)
 	print "finishing domChecking"
 	ga.loopChecking(cfg)
 	print "finishing loopChecking"
 def getStmtNum(node):
 	"""Count how many ``NextHead`` steps it takes to get from ``node[0]`` to at least ``node[1]``."""
 	limit = node[1]
 	cursor = node[0]
 	steps = 0
 	while cursor < limit:
 		cursor = NextHead(cursor)
 		steps += 1
 	return steps
 def getBinaryValue(node):
 	start = node[0]
 	inst_addr = NextHead(start)
 	value = 0
 	addr = 0
 	for x in xrange((inst_addr - start)-1):
 		addr = start + x
 		y = GetOriginalByte(addr)
 		print value, addr, y
 		value = value | y
 		value = value << 8
 		print value
 	addr = inst_addr - 1
 	y = GetOriginalByte(addr)
 	print value, addr, y
 	value = value | y
 	print node
 	print bin(value)
 	return value
 def cfg_construct(func):
 	"""Build a control-flow graph of ``func`` whose nodes are (start, end) block tuples.
 	Edges are added for fall-through into a block keyed by this block's end
 	(unless the last instruction is 'jmp') and for every code reference
 	leaving the last instruction that lands on a known block key.
 	Returns ``(cfg, start_node)``. ``start_node`` is only bound when some
 	block starts at the function start.
 	"""
 	func_start = func.startEA
 	func_end = func.endEA
 	cfg = nx.DiGraph()
 	seq_blocks, main_blocks = obtain_block_sequence(func)
 	for key in seq_blocks:
 		block = seq_blocks[key]
 		start = block[0]
 		end = block[1]
 		src_node = (start, end)
 		if end in seq_blocks and GetMnem(PrevHead(end)) != 'jmp':
 			following = seq_blocks[end]
 			cfg.add_edge(src_node, (following[0], following[1]))
 		if start == func_start:
 			cfg.add_node(src_node, c='start')
 			start_node = src_node
 		if end == func_end:
 			cfg.add_node(src_node, c='end')
 		for ref in CodeRefsFrom(PrevHead(end), 0):
 			if ref not in seq_blocks:
 				continue
 			target = seq_blocks[ref]
 			cfg.add_edge(src_node, (target[0], target[1]))
 	return cfg, start_node
 def obtain_allpaths( cfg, node, path, allpaths):
 	"""Collect into ``allpaths`` every cycle-free path from ``node`` to a node tagged c == 'end'.
 	``path`` is the list of nodes walked so far; ``node`` is appended to it in
 	place, and each branch continues on its own shallow copy.
 	"""
 	path.append(node)
 	attrs = cfg.node[node]
 	if 'c' in attrs and attrs['c'] == 'end':
 		allpaths.append(path)
 		return
 	for suc in cfg.successors(node):
 		if suc in path:
 			# Already on this path: following it would loop.
 			continue
 		obtain_allpaths(cfg, suc, copy.copy(path), allpaths)
 def obtain_block_sequence(func):
 	"""Index the basic blocks of ``func``.
 	Returns ``(control_blocks, main_starts)``:
 	* ``control_blocks`` maps two addresses per block to its (startEA, endEA)
 	  tuple: the address returned by ``checkCB`` and the last head of the block.
 	* ``main_starts`` is the sorted list of start addresses of the blocks that
 	  begin inside [func.startEA, func.endEA].
 	A function without blocks yields ``({}, [])``; this used to fail because
 	the sorted list was only built inside the loop.
 	"""
 	control_blocks = {}
 	main_blocks = {}
 	blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
 	for bl in blocks:
 		base = bl[0]
 		end = PrevHead(bl[1])
 		control_ea = checkCB(bl)
 		control_blocks[control_ea] = bl
 		control_blocks[end] = bl
 		if func.startEA <= base <= func.endEA:
 			main_blocks[base] = bl
 	# Sort once after the loop instead of re-sorting on every iteration.
 	return control_blocks, sorted(main_blocks)
 def checkCB(bl):
 	"""Return the first address in block ``bl`` that satisfies ``checkCondition``, or the block's last head when none does."""
 	limit = bl[1]
 	cursor = bl[0]
 	while cursor < limit:
 		if checkCondition(cursor):
 			return cursor
 		cursor = NextHead(cursor)
 	return PrevHead(limit)
 def checkCondition(ea):
 	"""Return True when the mnemonic at ``ea`` is one of the listed MIPS or x86 branch mnemonics."""
 	mips_branch = set(["beqz", "beq", "bne", "bgez", "b", "bnez", "bgtz", "bltz", "blez", "bgt", "bge", "blt", "ble", "bgtu", "bgeu", "bltu", "bleu"])
 	x86_branch = set(["jz", "jnb", "jne", "je", "jg", "jle", "jl", "jge", "ja", "jae", "jb", "jbe", "jo", "jno", "js", "jns"])
 	# NOTE(review): the ARM mnemonics are listed but never consulted, so ARM
 	# branches are not recognised -- confirm whether that is intended.
 	arm_branch = set(["B", "BAL", "BNE", "BEQ", "BPL", "BMI", "BCC", "BLO", "BCS", "BHS", "BVC", "BVS", "BGT", "BGE", "BLT", "BLE", "BHI", "BLS"])
 	return GetMnem(ea) in (mips_branch | x86_branch)
--- a/raw-feature-extractor/discovRe.py
+++ b/raw-feature-extractor/discovRe.py
@ -0,0 +1,228 @@
 #
 # Reference Lister
 #
 # List all functions and all references to them in the current section.
 #
 # Implemented with the idautils module
 #
 import networkx as nx
 import cPickle as pickle
 import pdb
 from graph_analysis_ida import *
 from graph_property import *
 #import wingdbstub
 #wingdbstub.Ensure()
 def get_funcs(ea):
 	"""Map each function name from Functions(SegStart(ea)) to the (startEA, endEA) pairs of its flow-chart blocks."""
 	funcs = {}
 	for funcea in Functions(SegStart(ea)):
 		funcname = GetFunctionName(funcea)
 		func = get_func(funcea)
 		funcs[funcname] = [(bl.startEA, bl.endEA) for bl in FlowChart(func)]
 	return funcs
 def get_funcs_for_discoverRe(ea):
    """Map each function name from Functions(SegStart(ea)) to its get_discoverRe_feature result."""
    features = {}
    for funcea in Functions(SegStart(ea)):
        funcname = GetFunctionName(funcea)
        # Progress trace: one line per processed function.
        print funcname
        func = get_func(funcea)
        # NOTE(review): get_discoverRe_feature is defined in this file as
        # (func, icfg); this call passes only func - confirm how the CFG
        # argument is meant to be supplied here.
        feature = get_discoverRe_feature(func)
        features[funcname] = feature
    return features
 def get_discoverRe_feature(func, icfg):
    """Build the function-level feature list for func.
    The list holds, in order: call count, logic-instruction count,
    transfer-instruction count, local-variable count, basic-block count,
    number of edges of icfg, incoming-call count, instruction count, the
    retrieveGP value of icfg, then the strings and the constants returned by
    getfunc_consts.
    """
    features = [
        getFuncCalls(func),       #1
        getLogicInsts(func),      #2
        getTransferInsts(func),   #3
        getLocalVariables(func),  #4
        getBasicBlocks(func),     #5
        len(icfg.edges()),        #6
        getIncommingCalls(func),  #7
        getIntrs(func),           #8
        retrieveGP(icfg),         #9
    ]
    strings, consts = getfunc_consts(func)
    features.extend([strings, consts])
    return features
 def get_func_names(ea):
    """Map function names to their start addresses for Functions(SegStart(ea))."""
    return dict((GetFunctionName(funcea), funcea) for funcea in Functions(SegStart(ea)))
 def get_func_bases(ea):
    """Map function start addresses to their names for Functions(SegStart(ea))."""
    return dict((funcea, GetFunctionName(funcea)) for funcea in Functions(SegStart(ea)))
 def get_func_range(ea):
    """Map each function name to its (startEA, endEA) pair for Functions(SegStart(ea))."""
    ranges = {}
    for funcea in Functions(SegStart(ea)):
        name = GetFunctionName(funcea)
        func = get_func(funcea)
        ranges[name] = (func.startEA, func.endEA)
    return ranges
 def get_func_sequences(ea):
 	"""Map each function name from get_funcs(ea) to the flat list of mnemonics of all its blocks."""
 	funcs_bodylist = {}
 	funcs = get_funcs(ea)
 	for funcname in funcs:
 		opcodes = funcs_bodylist.setdefault(funcname, [])
 		for start, end in funcs[funcname]:
 			inst_addr = start
 			# end is the block's endEA, which lies just past the block, so it
 			# is excluded; the previous `<=` also picked up the first head of
 			# whatever follows the block.
 			while inst_addr < end:
 				opcodes.append(GetMnem(inst_addr))
 				inst_addr = NextHead(inst_addr)
 	return funcs_bodylist
 def get_func_cfgs(ea):
    """Map function names to cfg.cfg_construct(func) for the functions located inside the 'LOAD' section.
    A running counter is printed per processed function. Functions whose CFG
    construction raises are skipped, and the failure is reported.
    """
    func_cfglist = {}
    i = 0
    start, end = get_section('LOAD')
    for funcea in Functions(SegStart(ea)):
        if not (start <= funcea <= end):
            continue
        funcname = GetFunctionName(funcea)
        func = get_func(funcea)
        print(i)
        i += 1
        try:
            # NOTE(review): `cfg` is not among the names imported explicitly at
            # the top of this module - confirm where it is expected to come from.
            func_cfglist[funcname] = cfg.cfg_construct(func)
        except Exception as err:
            # Still best effort, but no longer silent: a bare `except: pass`
            # hid every failure, including an unresolved name.
            print('cfg construction failed for %s: %r' % (funcname, err))
    return func_cfglist
 def get_section(t):
    """Return (start, end) for the segment named t, resolved through SegByName, SegByBase and SegEnd."""
    start = SegByBase(SegByName(t))
    return start, SegEnd(start)
 def get_func_cfg_sequences(func_cfglist):
    """For every function, map each (start, end) node of its graph (element 0 of the stored entry) to the node's mnemonic sequence."""
    func_cfg_seqlist = {}
    for funcname in func_cfglist:
        graph = func_cfglist[funcname][0]
        per_block = {}
        func_cfg_seqlist[funcname] = per_block
        for start, end in graph:
            per_block[(start, end)] = get_sequences(start, end)
    return func_cfg_seqlist
 def get_sequences(start, end):
    """Return the mnemonics of the heads from start up to and including end."""
    mnemonics = []
    ea = start
    # NOTE(review): the bound is inclusive; confirm callers pass the address
    # of the last head rather than an exclusive block end.
    while ea <= end:
        mnemonics.append(GetMnem(ea))
        ea = NextHead(ea)
    return mnemonics
 def get_stack_arg(func_addr):
    """List the distinct stack-frame member names of func_addr, skipping names that contain ' s' or ' r'."""
    print(func_addr)
    stack = GetFrame(func_addr)
    if not stack:
        return []
    names = []
    offset = GetFirstMember(stack)
    last = GetLastMember(stack)
    while offset <= last:
        name = GetMemberName(stack, offset)
        size = GetMemberSize(stack, offset)
        # Offsets without a member size are stepped over 4 bytes at a time.
        offset += size if size else 4
        if name and name not in names and ' s' not in name and ' r' not in name:
            names.append(name)
    return names
        #pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))
 def processDataSegs():
    """Cross-index data references into the DATA and BSS segments.
    Returns (funcdata, datafunc): funcdata maps a function name to the data
    addresses referenced from it, datafunc maps a data address to the names
    of the functions referencing it.
    """
    funcdata = {}
    datafunc = {}
    for n in xrange(idaapi.get_segm_qty()):
        ea = idaapi.getnseg(n).startEA
        if idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE) not in (idc.SEG_DATA, idc.SEG_BSS):
            continue
        cur = idc.SegStart(ea)
        end = idc.SegEnd(ea)
        while cur <= end:
            for fea in DataRefsTo(cur):
                name = GetFunctionName(fea)
                # References that do not come from inside a named function are ignored.
                if len(name) == 0:
                    continue
                funcdata.setdefault(name, []).append(cur)
                datafunc.setdefault(cur, []).append(name)
            cur = NextHead(cur)
    return funcdata, datafunc
 def obtainDataRefs(callgraph):
    """Map each callgraph node to the de-duplicated names of the functions that reference the same data addresses as it."""
    datarefs = {}
    funcdata, datafunc = processDataSegs()
    for node in callgraph:
        if node not in funcdata:
            continue
        for dd in funcdata[node]:
            refs = list(set(datafunc[dd]))
            if node not in datarefs:
                datarefs[node] = refs
                continue
            print(refs)
            # Merge with what was collected so far and drop duplicates again.
            datarefs[node] = list(set(datarefs[node] + refs))
    return datarefs
--- a/raw-feature-extractor/func.py
+++ b/raw-feature-extractor/func.py
@ -0,0 +1,285 @@
 #
 # Reference Lister
 #
 # List all functions and all references to them in the current section.
 #
 # Implemented with the idautils module
 #
 from idautils import *
 from idaapi import *
 from idc import *
 import networkx as nx
 import cfg_constructor as cfg
 import cPickle as pickle
 import pdb
 from raw_graphs import *
 #from discovRe_feature.discovRe import *
 from discovRe import *
 #import wingdbstub
 #wingdbstub.Ensure()
 def gt_funcNames(ea):
 	"""List the unified names of the functions from Functions(SegStart(ea)), printing and leaving out those reported by processpltSegs."""
 	funcs = []
 	plt_func, plt_data = processpltSegs()
 	for funcea in Functions(SegStart(ea)):
 		funcname = get_unified_funcname(funcea)
 		if funcname in plt_func:
 			print(funcname)
 		else:
 			funcs.append(funcname)
 	return funcs
 def get_funcs(ea):
 	"""Map each unified function name to the (startEA, endEA) pairs of its flow-chart blocks, leaving out names reported by processpltSegs."""
 	funcs = {}
 	plt_func, plt_data = processpltSegs()
 	for funcea in Functions(SegStart(ea)):
 		funcname = get_unified_funcname(funcea)
 		if funcname in plt_func:
 			continue
 		func = get_func(funcea)
 		funcs[funcname] = [(bl.startEA, bl.endEA) for bl in FlowChart(func)]
 	return funcs
 # used for the callgraph generation.
 def get_func_namesWithoutE(ea):
 	"""Map unified function names to their addresses, printing and leaving out names reported by processpltSegs."""
 	funcs = {}
 	plt_func, plt_data = processpltSegs()
 	for funcea in Functions(SegStart(ea)):
 		funcname = get_unified_funcname(funcea)
 		# Debug trace: print the address of any function whose name contains 'close'.
 		if 'close' in funcname:
 			print(funcea)
 		if funcname in plt_func:
 			print(funcname)
 		else:
 			funcs[funcname] = funcea
 	return funcs
 # used for the callgraph generation.
 def get_func_names(ea):
 	"""Map unified function names to their start addresses for Functions(SegStart(ea))."""
 	return dict((get_unified_funcname(funcea), funcea) for funcea in Functions(SegStart(ea)))
 def get_func_bases(ea):
 	"""Map function start addresses to unified names, leaving out names reported by processpltSegs."""
 	plt_func, plt_data = processpltSegs()
 	funcs = {}
 	for funcea in Functions(SegStart(ea)):
 		funcname = get_unified_funcname(funcea)
 		if funcname not in plt_func:
 			funcs[funcea] = funcname
 	return funcs
 def get_func_range(ea):
 	"""Map each unified function name to its (startEA, endEA) pair for Functions(SegStart(ea))."""
 	funcs = {}
 	for funcea in Functions(SegStart(ea)):
 		funcname = get_unified_funcname(funcea)
 		# These two statements used to sit outside the loop, so only the last
 		# function was recorded and an empty function list raised NameError.
 		func = get_func(funcea)
 		funcs[funcname] = (func.startEA, func.endEA)
 	return funcs
 def get_unified_funcname(ea):
 	"""Return GetFunctionName(ea) with a single leading '.' removed, if present."""
 	funcname = GetFunctionName(ea)
 	if funcname[:1] == '.':
 		return funcname[1:]
 	return funcname
 def get_func_sequences(ea):
 	"""Map each function name from get_funcs(ea) to the flat list of mnemonics of all its blocks."""
 	funcs_bodylist = {}
 	funcs = get_funcs(ea)
 	for funcname in funcs:
 		opcodes = funcs_bodylist.setdefault(funcname, [])
 		for start, end in funcs[funcname]:
 			inst_addr = start
 			# end is the block's endEA, which lies just past the block, so it
 			# is excluded; the previous `<=` also picked up the first head of
 			# whatever follows the block.
 			while inst_addr < end:
 				opcodes.append(GetMnem(inst_addr))
 				inst_addr = NextHead(inst_addr)
 	return funcs_bodylist
 def get_func_cfgs_c(ea):
 	"""Build a raw_graphs container holding one raw_graph per function from Functions(SegStart(ea)).
 	Each raw_graph gets the cfg.getCfg result and the get_discoverRe_feature
 	list of its function. A running index is printed per function.
 	"""
 	raw_cfgs = raw_graphs(idc.GetInputFile())
 	externs_eas, ea_externs = processpltSegs()
 	for i, funcea in enumerate(Functions(SegStart(ea))):
 		funcname = get_unified_funcname(funcea)
 		func = get_func(funcea)
 		print(i)
 		icfg = cfg.getCfg(func, externs_eas, ea_externs)
 		# Element 0 of the getCfg result is the graph handed to the feature extractor.
 		func_f = get_discoverRe_feature(func, icfg[0])
 		raw_cfgs.append(raw_graph(funcname, icfg, func_f))
 	return raw_cfgs
 def get_func_cfgs_ctest(ea):
 	"""Map each unified function name to the two-element result of cfg.getCfg, as a tuple; prints a running index."""
 	input_name = idc.GetInputFile()
 	# Built as before, but not part of the returned value.
 	unused_container = raw_graphs(input_name)
 	externs_eas, ea_externs = processpltSegs()
 	diffs = {}
 	for i, funcea in enumerate(Functions(SegStart(ea))):
 		funcname = get_unified_funcname(funcea)
 		func = get_func(funcea)
 		print(i)
 		icfg, old_cfg = cfg.getCfg(func, externs_eas, ea_externs)
 		diffs[funcname] = (icfg, old_cfg)
 	return diffs
 def get_func_cfgs(ea):
 	"""Map unified function names to cfg.getCfg(func) for Functions(SegStart(ea)).
 	A running index is printed per function. Functions whose CFG construction
 	raises are skipped, and the failure is reported.
 	"""
 	func_cfglist = {}
 	for i, funcea in enumerate(Functions(SegStart(ea))):
 		funcname = get_unified_funcname(funcea)
 		func = get_func(funcea)
 		print(i)
 		try:
 			# NOTE(review): the other callers in this file pass
 			# (func, externs_eas, ea_externs) to cfg.getCfg - confirm that the
 			# one-argument form is valid.
 			func_cfglist[funcname] = cfg.getCfg(func)
 		except Exception as err:
 			# Still best effort, but no longer silent: a bare `except: pass`
 			# hid every failure.
 			print('cfg construction failed for %s: %r' % (funcname, err))
 	return func_cfglist
 def get_func_cfg_sequences(func_cfglist):
 	"""For every function, map each (start, end) node of its graph (element 0 of the stored entry) to the node's mnemonic sequence."""
 	func_cfg_seqlist = {}
 	for funcname in func_cfglist:
 		graph = func_cfglist[funcname][0]
 		per_block = {}
 		func_cfg_seqlist[funcname] = per_block
 		for start, end in graph:
 			per_block[(start, end)] = get_sequences(start, end)
 	return func_cfg_seqlist
 def get_sequences(start, end):
 	"""Return the mnemonics of the heads from start up to and including end."""
 	mnemonics = []
 	ea = start
 	# NOTE(review): the bound is inclusive; confirm callers pass the address
 	# of the last head rather than an exclusive block end.
 	while ea <= end:
 		mnemonics.append(GetMnem(ea))
 		ea = NextHead(ea)
 	return mnemonics
 def get_stack_arg(func_addr):
 	"""List the distinct stack-frame member names of func_addr, skipping names that contain ' s' or ' r'."""
 	print(func_addr)
 	stack = GetFrame(func_addr)
 	if not stack:
 		return []
 	names = []
 	offset = GetFirstMember(stack)
 	last = GetLastMember(stack)
 	while offset <= last:
 		name = GetMemberName(stack, offset)
 		size = GetMemberSize(stack, offset)
 		# Offsets without a member size are stepped over 4 bytes at a time.
 		offset += size if size else 4
 		if name and name not in names and ' s' not in name and ' r' not in name:
 			names.append(name)
 	return names
 		#pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))
 def processExternalSegs():
 	"""Map the unified function name found at every head of the SEG_XTRN segments to hex(address)."""
 	funcdata = {}
 	for n in xrange(idaapi.get_segm_qty()):
 		ea = idaapi.getnseg(n).startEA
 		if idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE) != idc.SEG_XTRN:
 			continue
 		cur = idc.SegStart(ea)
 		end = idc.SegEnd(ea)
 		while cur <= end:
 			funcdata[get_unified_funcname(cur)] = hex(cur)
 			cur = NextHead(cur)
 	return funcdata
 def processpltSegs():
 	"""Index every head of the '.plt', 'extern' and '.MIPS.stubs' segments.
 	Returns (funcdata, datafunc): unified function name -> hex(address)
 	string, and address -> unified function name.
 	"""
 	funcdata = {}
 	datafunc = {}
 	for n in xrange(idaapi.get_segm_qty()):
 		seg = idaapi.getnseg(n)
 		if SegName(seg.startEA) not in ('.plt', 'extern', '.MIPS.stubs'):
 			continue
 		cur = seg.startEA
 		limit = seg.endEA
 		while cur < limit:
 			name = get_unified_funcname(cur)
 			funcdata[name] = hex(cur)
 			datafunc[cur] = name
 			cur = NextHead(cur)
 	return funcdata, datafunc
 def processDataSegs():
 	"""Cross-index data references into the DATA and BSS segments.
 	Returns (funcdata, datafunc): funcdata maps a unified function name to
 	the data addresses referenced from it, datafunc maps a data address to
 	the names of the functions referencing it.
 	"""
 	funcdata = {}
 	datafunc = {}
 	for n in xrange(idaapi.get_segm_qty()):
 		ea = idaapi.getnseg(n).startEA
 		if idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE) not in (idc.SEG_DATA, idc.SEG_BSS):
 			continue
 		cur = idc.SegStart(ea)
 		end = idc.SegEnd(ea)
 		while cur <= end:
 			for fea in DataRefsTo(cur):
 				name = get_unified_funcname(fea)
 				# References that do not come from inside a named function are ignored.
 				if len(name) == 0:
 					continue
 				funcdata.setdefault(name, []).append(cur)
 				datafunc.setdefault(cur, []).append(name)
 			cur = NextHead(cur)
 	return funcdata, datafunc
 def obtainDataRefs(callgraph):
 	"""Map each callgraph node to the de-duplicated names of the functions that reference the same data addresses as it."""
 	datarefs = {}
 	funcdata, datafunc = processDataSegs()
 	for node in callgraph:
 		if node not in funcdata:
 			continue
 		for dd in funcdata[node]:
 			refs = list(set(datafunc[dd]))
 			if node not in datarefs:
 				datarefs[node] = refs
 				continue
 			print(refs)
 			# Merge with what was collected so far and drop duplicates again.
 			datarefs[node] = list(set(datarefs[node] + refs))
 	return datarefs
--- a/raw-feature-extractor/graph_analysis_ida.py
+++ b/raw-feature-extractor/graph_analysis_ida.py
@ -0,0 +1,257 @@
 from idautils import *
 from idaapi import *
 from idc import *
 def getfunc_consts(func):
 	"""Concatenate the (strings, consts) lists returned by getBBconsts over every flow-chart block of func."""
 	strings, consts = [], []
 	for block in FlowChart(func):
 		found_strs, found_consts = getBBconsts((block.startEA, block.endEA))
 		strings.extend(found_strs)
 		consts.extend(found_consts)
 	return strings, consts
 def getConst(ea, offset):
 	"""Classify operand `offset` of the instruction at ea.
 	Returns (strings, consts), each holding at most one item. Nothing is
 	collected unless the operand type is idaapi.o_imm. Immediates in 0..10
 	are always constants. Larger ones are treated as a string when they
 	point into loaded, mapped memory and GetString (at the value, or at
 	value + 0x40000) yields text whose characters all have codes in
 	[40, 128); otherwise they are constants.
 	"""
 	strings = []
 	consts = []
 	if GetOpType(ea, offset) != idaapi.o_imm:
 		return strings, consts
 	imm_value = GetOperandValue(ea, offset)
 	if 0 <= imm_value <= 10 or not (idaapi.isLoaded(imm_value) and idaapi.getseg(imm_value)):
 		consts.append(imm_value)
 		return strings, consts
 	str_value = GetString(imm_value)
 	if str_value is None:
 		# Second attempt at a fixed displacement from the immediate.
 		str_value = GetString(imm_value+0x40000)
 	if str_value is not None and all(40 <= ord(c) < 128 for c in str_value):
 		strings.append(str_value)
 	else:
 		consts.append(imm_value)
 	return strings, consts
 def getBBconsts(bl):
 	"""Gather string and numeric constants from operands 0, 1 and 2 of each head in [bl[0], bl[1]).
 	Heads whose mnemonic is 'la', 'jalr', 'call' or 'jal' are skipped.
 	Returns (strings, consts).
 	"""
 	strings = []
 	consts = []
 	inst_addr = bl[0]
 	end = bl[1]
 	while inst_addr < end:
 		if GetMnem(inst_addr) not in ('la', 'jalr', 'call', 'jal'):
 			strings_src, consts_src = getConst(inst_addr, 0)
 			strings_dst, consts_dst = getConst(inst_addr, 1)
 			strings += strings_src
 			strings += strings_dst
 			consts += consts_src
 			consts += consts_dst
 			try:
 				strings_third, consts_third = getConst(inst_addr, 2)
 			except Exception:
 				# Best effort, as before: a failing third-operand lookup is
 				# skipped. Only Exception is caught so that interrupts and
 				# exits are no longer swallowed.
 				pass
 			else:
 				consts += consts_third
 				strings += strings_third
 		inst_addr = NextHead(inst_addr)
 	return strings, consts
 def getFuncCalls(func):
 	"""Sum calCalls over every flow-chart block of func."""
 	return sum(calCalls((v.startEA, v.endEA)) for v in FlowChart(func))
 def getLogicInsts(func):
 	"""Sum calLogicInstructions over every flow-chart block of func."""
 	return sum(calLogicInstructions((v.startEA, v.endEA)) for v in FlowChart(func))
 def getTransferInsts(func):
 	"""Sum calTransferIns over every flow-chart block of func."""
 	return sum(calTransferIns((v.startEA, v.endEA)) for v in FlowChart(func))
 def getIntrs(func):
 	"""Sum calInsts over every flow-chart block of func."""
 	return sum(calInsts((v.startEA, v.endEA)) for v in FlowChart(func))
 def getLocalVariables(func):
 	"""Return get_stackVariables for the start address of func."""
 	return get_stackVariables(func.startEA)
 def getBasicBlocks(func):
 	"""Return the number of blocks in the flow chart of func."""
 	count = 0
 	for _block in FlowChart(func):
 		count += 1
 	return count
 def getIncommingCalls(func):
 	"""Return the number of code references reported by CodeRefsTo(func.startEA, 0)."""
 	return sum(1 for _ref in CodeRefsTo(func.startEA, 0))
 def get_stackVariables(func_addr):
    """Count the distinct stack-frame member names of func_addr that contain 'var_' (0 when there is no frame)."""
    stack = GetFrame(func_addr)
    if not stack:
        return 0
    seen = []
    offset = GetFirstMember(stack)
    last = GetLastMember(stack)
    while offset <= last:
        name = GetMemberName(stack, offset)
        size = GetMemberSize(stack, offset)
        # Offsets without a member size are stepped over 4 bytes at a time.
        offset += size if size else 4
        if name and 'var_' in name and name not in seen:
            seen.append(name)
    return len(seen)
 def calArithmeticIns(bl):
 	"""Count the heads in [bl[0], bl[1]) whose mnemonic is a listed x86 or MIPS arithmetic instruction."""
 	x86_AI = ['add', 'sub', 'div', 'imul', 'idiv', 'mul', 'shl', 'dec', 'inc']
 	mips_AI = ['add', 'addu', 'addi', 'addiu', 'mult', 'multu', 'div', 'divu']
 	arithmetic = set(x86_AI + mips_AI)
 	count = 0
 	ea = bl[0]
 	limit = bl[1]
 	while ea < limit:
 		if GetMnem(ea) in arithmetic:
 			count += 1
 		ea = NextHead(ea)
 	return count
 def calCalls(bl):
 	"""Count the heads in [bl[0], bl[1]) whose mnemonic is 'call', 'jal' or 'jalr'."""
 	call_mnemonics = ('call', 'jal', 'jalr')
 	count = 0
 	ea = bl[0]
 	limit = bl[1]
 	while ea < limit:
 		if GetMnem(ea) in call_mnemonics:
 			count += 1
 		ea = NextHead(ea)
 	return count
 def calInsts(bl):
 	"""Count the heads visited by NextHead from bl[0] while the address stays below bl[1]."""
 	count = 0
 	cursor, limit = bl[0], bl[1]
 	while cursor < limit:
 		cursor = NextHead(cursor)
 		count += 1
 	return count
 def calLogicInstructions(bl):
 	"""Count the heads in [bl[0], bl[1]) whose mnemonic is a listed x86 or MIPS logic instruction."""
 	x86_LI = ['and', 'andn', 'andnpd', 'andpd', 'andps', 'andnps', 'test', 'xor', 'xorpd', 'pslld']
 	mips_LI = ['and', 'andi', 'or', 'ori', 'xor', 'nor', 'slt', 'slti', 'sltu']
 	logic = set(x86_LI + mips_LI)
 	count = 0
 	ea = bl[0]
 	limit = bl[1]
 	while ea < limit:
 		if GetMnem(ea) in logic:
 			count += 1
 		ea = NextHead(ea)
 	return count
 def calSconstants(bl):
 	"""Count the heads in [bl[0], bl[1]) whose mnemonic is contained in `calls`.

 	NOTE(review): `calls` is neither a parameter nor a local of this function,
 	unlike in the sibling counters, which build it from mnemonic tables. Unless
 	a global of that name exists, the first loop iteration raises NameError.
 	Confirm what this function is meant to count.
 	"""
 	start = bl[0]
 	end = bl[1]
 	invoke_num = 0
 	inst_addr = start
 	while inst_addr < end:
 		opcode = GetMnem(inst_addr)
 		if opcode in calls:
 			invoke_num += 1
 		inst_addr = NextHead(inst_addr)
 	return invoke_num
 def calNconstants(bl):
 	"""Count the heads in [bl[0], bl[1]) for which operand 0 or operand 1 has operand type 5."""
 	# NOTE(review): 5 is presumably idaapi.o_imm (immediate operand) - confirm.
 	wanted_type = 5
 	count = 0
 	ea = bl[0]
 	limit = bl[1]
 	while ea < limit:
 		if wanted_type in (GetOpType(ea, 0), GetOpType(ea, 1)):
 			count += 1
 		ea = NextHead(ea)
 	return count
 def retrieveExterns(bl, ea_externs):
 	"""Collect external names referenced from the heads in [bl[0], bl[1]).
 	For each head, the first target of CodeRefsFrom(head, 1) that is a key of
 	ea_externs contributes ea_externs[target] to the returned list; heads
 	without such a target contribute nothing.
 	"""
 	externs = []
 	inst_addr = bl[0]
 	end = bl[1]
 	while inst_addr < end:
 		# Explicit search instead of indexing [0] under a bare except, which
 		# used IndexError as control flow and hid every other error too.
 		for target in CodeRefsFrom(inst_addr, 1):
 			if target in ea_externs:
 				externs.append(ea_externs[target])
 				break
 		inst_addr = NextHead(inst_addr)
 	return externs
 def calTransferIns(bl):
 	"""Count the heads in [bl[0], bl[1]) whose mnemonic occurs as a substring of a listed x86 or MIPS transfer mnemonic."""
 	x86_TI = ['jmp', 'jz', 'jnz', 'js', 'je', 'jne', 'jg', 'jle', 'jge', 'ja', 'jnc', 'call']
 	mips_TI = ['beq', 'bne', 'bgtz', "bltz", "bgez", "blez", 'j', 'jal', 'jr', 'jalr']
 	# NOTE(review): arm_TI is defined but never used, exactly as before.
 	arm_TI = ['MVN', "MOV"]
 	known = set(x86_TI + mips_TI)
 	count = 0
 	ea = bl[0]
 	limit = bl[1]
 	while ea < limit:
 		opcode = GetMnem(ea)
 		# Substring test (mnemonic inside a table entry), not equality: an
 		# empty mnemonic therefore matches every entry.
 		# NOTE(review): confirm this is intended rather than `opcode in known`.
 		if any(opcode in entry for entry in known):
 			count += 1
 		ea = NextHead(ea)
 	return count
--- a/raw-feature-extractor/graph_property.py
+++ b/raw-feature-extractor/graph_property.py
@ -0,0 +1,24 @@
 import networkx as nx
 import pdb
 def betweeness(g):
 	"""Return the nx.betweenness_centrality result for g."""
 	return nx.betweenness_centrality(g)
 def eigenvector(g):
 	"""Return the nx.eigenvector_centrality result for g."""
 	return nx.eigenvector_centrality(g)
 def closeness_centrality(g):
 	"""Return the nx.closeness_centrality result for g."""
 	return nx.closeness_centrality(g)
 def retrieveGP(g):
 	"""Return the mean of the betweenness values of g, rounded to 5 decimals.
 	An empty centrality mapping yields 0.0 instead of a ZeroDivisionError.
 	"""
 	bf = betweeness(g)
 	if not bf:
 		return 0.0
 	# The values are summed in sorted order, as before, so results stay
 	# bit-for-bit comparable with previously extracted features.
 	x = sorted(bf.values())
 	value = sum(x)/len(x)
 	return round(value,5)
--- a/raw-feature-extractor/preprocessing_ida.py
+++ b/raw-feature-extractor/preprocessing_ida.py
@ -0,0 +1,27 @@
 from func import *
 from raw_graphs import *
 from idc import *
 import os
 import argparse
 def parse_command():
 	"""Parse the command line and return the argparse namespace; `path` is None when --path is not given."""
 	# The description used to be the placeholder text of the argparse example.
 	parser = argparse.ArgumentParser(description='Extract raw function graphs from the binary loaded in IDA.')
 	parser.add_argument("--path", type=str, help="The directory where to store the generated .ida file")
 	return parser.parse_args()
 if __name__ == '__main__':
 	# Script entry point: extract the raw graphs of the loaded binary and
 	# pickle them to <path>/<input file name>.ida, then leave IDA.
 	args = parse_command()
 	path = args.path
 	analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
 	analysis_flags &= ~idc.AF_IMMOFF
 	# turn off "automatically make offset" heuristic
 	idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
 	idaapi.autoWait()
 	cfgs = get_func_cfgs_c(FirstSeg())
 	binary_name = idc.GetInputFile() + '.ida'
 	fullpath = os.path.join(path, binary_name)
 	# Close (and thereby flush) the dump before idc.Exit is called; the
 	# handle used to be left open. Binary mode keeps the pickle bytes
 	# untouched on every platform.
 	with open(fullpath, 'wb') as out:
 		pickle.dump(cfgs, out)
 	print(binary_name)
 	idc.Exit(0)
--- a/raw-feature-extractor/raw_graphs.py
+++ b/raw-feature-extractor/raw_graphs.py
@ -0,0 +1,286 @@
 import itertools
 import sys
 sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
 import networkx as nx
 #import numpy as np
 from subprocess import Popen, PIPE
 import pdb
 import os
 import re,mmap
 #from graph_edit_new import *
 class raw_graph:
 	"""Graph of one function: copies the structure of an extracted graph and attaches a feature vector to every node."""
 	def __init__(self, funcname, g, func_f):
 		"""g is indexed as g[0] (the attributed source graph) and g[1] (stored as the entry); func_f is kept as fun_features."""
 		self.funcname = funcname
 		self.old_g = g[0]
 		self.g = nx.DiGraph()
 		self.entry = g[1]
 		self.fun_features = func_f
 		self.attributing()
 	def __len__(self):
 		"""Number of nodes of the attributed graph."""
 		return len(self.g)
 	def attributing(self):
 		"""Fill self.g with the nodes and edges of self.old_g, storing each node's feature vector under key 'v'."""
 		self.obtainOffsprings(self.old_g)
 		for node in self.old_g:
 			fvector = self.retrieveVec(node, self.old_g)
 			self.g.add_node(node)
 			self.g.node[node]['v'] = fvector
 		for edge in self.old_g.edges():
 			self.g.add_edge(edge[0], edge[1])
 	def obtainOffsprings(self,g):
 		"""Store under g.node[n]['offs'] the number of nodes reachable from n through at least one edge; returns g."""
 		for node in g.nodes():
 			offsprings = {}
 			self.getOffsprings(g, node, offsprings)
 			g.node[node]['offs'] = len(offsprings)
 		return g
 	def getOffsprings(self, g, node, offsprings):
 		"""Add to the dict offsprings every node reachable from node via successors.
 		Iterative depth-first walk: it collects the same nodes as the former
 		recursive version without being bounded by the recursion limit.
 		"""
 		pending = [node]
 		while pending:
 			current = pending.pop()
 			for suc in g.successors(current):
 				if suc not in offsprings:
 					offsprings[suc] = 1
 					pending.append(suc)
 	def retrieveVec(self, id_, g):
 		"""Return the node feature vector, in the order of the attribute keys listed below."""
 		attrs = g.node[id_]
 		# consts(0) strings(1) offsprings(2) numAs(3) calls(4) insts(5) LIs(6) TIs(7)
 		keys = ('consts', 'strings', 'offs', 'numAs', 'numCalls', 'numIns', 'numLIs', 'numTIs')
 		return [attrs[key] for key in keys]
 	def enumerating(self, n):
 		"""Return every n-node subgraph of self.g that is connected when edge direction is ignored."""
 		subgs = []
 		for sub_nodes in itertools.combinations(self.g.nodes(), n):
 			subg = self.g.subgraph(sub_nodes)
 			if nx.is_connected(subg.to_undirected()):
 				subgs.append(subg)
 		return subgs
 	def genMotifs(self, n):
 		"""Group the subgraphs from enumerating(n) by isomorphism; each key is the first subgraph seen of its class."""
 		motifs = {}
 		# Fixed: this used to call the bare name `enumerating`, which is not defined.
 		for subg in self.enumerating(n):
 			for mt in motifs:
 				if nx.is_isomorphic(mt, subg):
 					motifs[mt].append(subg)
 					break
 			else:
 				motifs[subg] = [subg]
 		return motifs
 	def enumerating_efficient(self, n):
 		"""Enumerate size-n subgraphs with the external FANMOD command-line tool.
 		Returns [] when the graph has 200 or more nodes or when the tool's
 		return code is negative; otherwise the list produced by parseOutput
 		from the tool's dump file, which is removed afterwards.
 		"""
 		if len(self.g) >= 200:
 			return []
 		workdir = '/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/'
 		edge_file = workdir + 'OUTPUT.txt'
 		motif_file = workdir + 'MotifCount.txt'
 		dump_file = motif_file + '.dump'
 		with open(edge_file,'wb') as f:
 			nx.write_edgelist(self.g,f,data=False)
 		process = Popen(["/home/qian/workspace/FANMOD-command_line-source/executables/./fanmod_command_line_linux", str(n), "100000", "1", edge_file, "1", "0", "0", "2", "0", "0", "0", "1000", "3", "3", motif_file, "0", "1"], stdout=PIPE, stderr=PIPE)
 		stdout, stderr = process.communicate()
 		# NOTE(review): any non-negative return code, including non-zero
 		# ones, is treated as success - confirm.
 		if process.returncode >= 0:
 			subgs = self.parseOutput(dump_file, n)
 			os.remove(dump_file)
 			return subgs
 		return []
 	def _read_index_lists(self, path):
 		"""Return, for every 'a,b,c,d' group of integers found in the file at path, the list [b, c, d]."""
 		pattern = re.compile(r'[0-9]+\,[0-9]+\,[0-9]+\,[0-9]+')
 		with open(path,'r') as f:
 			data = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
 			try:
 				matches = re.findall(pattern, data)
 			finally:
 				# The mapping used to be left open.
 				data.close()
 		return [[int(x) for x in v.split(',')[1:]] for v in matches]
 	def parseOutput(self, path, n):
 		"""Build template graphs for the node-index groups found in the file at path (n is not used)."""
 		results = self._read_index_lists(path)
 		if results:
 			return self.createGraphDirectly(results)
 		return []
 	def parseOutputByconditions(self, path, n):
 		"""Same as parseOutput, but building through create_Graphbycondition_Directly (n is not used)."""
 		results = self._read_index_lists(path)
 		if results:
 			return self.create_Graphbycondition_Directly(results)
 		return []
 	def create_Graphbycondition_Directly(self, results):
 		"""Wrap the subgraph of self.g induced by each index list in a template_graph."""
 		subgs = []
 		for indexes in results:
 			tg = template_graph()
 			tg.updateG(self.g.subgraph(indexes))
 			subgs.append(tg)
 		return subgs
 	def createGraphDirectly(self, results):
 		"""Wrap the subgraph of self.g induced by each index list in a template_graph."""
 		subgs = []
 		for indexes in results:
 			tg = template_graph()
 			tg.updateG(self.g.subgraph(indexes))
 			subgs.append(tg)
 		return subgs
 	def createGraph(self, results, n):
 		"""Build a template_graph from results[0] (a binary adjacency string) and results[1:] (node indexes)."""
 		binary_value = int(results[0],2)
 		indexes = [int(v) for v in results[1:]]
 		fang = self.createG(results[0], n)
 		if fang:
 			tg = template_graph(binary_value)
 			# NOTE(review): template_graph.updateG in this file accepts a
 			# single graph argument, but three are passed here - confirm the
 			# intended call before relying on this method.
 			tg.updateG(fang, indexes, self.g)
 			return tg
 		# Debugging leftover kept as is: drops into pdb when no graph could be built.
 		pdb.set_trace()
 		print("there is g which is none")
 	def createG(self, binary_str, n):
 		"""Build the directed graph whose n x n adjacency matrix, read row by row, is binary_str.
 		Raises ValueError when binary_str does not hold exactly n * n digits.
 		"""
 		bits = [int(v) for v in binary_str]
 		if len(bits) != n * n:
 			raise ValueError('expected %d adjacency bits, got %d' % (n * n, len(bits)))
 		g = nx.DiGraph()
 		# Indexes the flat list directly: the numpy reshape used before relied
 		# on `np`, whose import is commented out in this module.
 		for i in range(n):
 			for j in range(n):
 				if bits[i * n + j] == 1:
 					g.add_edge(i, j)
 		return g
 class raw_graphs:
 	"""Named container for the raw_graph objects extracted from one binary."""
 	def __init__(self, binary_name):
 		"""Start with an empty list for the binary called binary_name."""
 		self.binary_name = binary_name
 		self.raw_graph_list = list()
 	def append(self, raw_g):
 		"""Add one graph at the end of raw_graph_list."""
 		self.raw_graph_list += [raw_g]
 	def __len__(self):
 		"""Number of stored graphs."""
 		return len(self.raw_graph_list)
 class graphlets:
 	"""Container for the subgraphs collected for one function."""
 	def __init__(self, funcname):
 		"""Start empty for funcname; binary_name stays None until updateBN is called."""
 		self.funcname = funcname
 		self.binary_name = None
 		self.graphlets_list = list()
 	def updateBN(self, binary_name):
 		"""Record the name of the binary."""
 		self.binary_name = binary_name
 	def append(self, subg):
 		"""Add one subgraph."""
 		self.graphlets_list += [subg]
 	def appendSet(self, subgs):
 		"""Add every subgraph of the iterable subgs."""
 		self.graphlets_list.extend(subgs)
 	def __len__(self):
 		"""Number of stored subgraphs."""
 		return len(self.graphlets_list)
 class template_graph:
 	"""Holder pairing an optional value with a graph that is set later."""
 	def __init__(self, value=None):
 		"""Store value; the graph attribute g starts as None."""
 		self.g = None
 		self.value = value
 	def updateG(self,g):
 		"""Replace the stored graph with g."""
 		self.g = g
 	#def updateIndexes(self, indexes):
 	#	self.indexes = indexes
 	#def updateAttributes(self, pg, indexes, maing):
 	#	for id_ in xrange(len(indexes)):
 	#		index = indexes[id_]
 	#		gnode = self.findNode(index, maing)
 	#		self.g.node[gnode] = pg.node[index]
 class template_graphs:
 	"""Enumerates directed graphs on `size` nodes, one candidate per size*size adjacency bit pattern."""
 	def __init__(self, size):
 		"""bit_len stays None until genBinValue (or genArray) computes it."""
 		self.size = size
 		self.gs = []
 		self.bit_len = None
 	def enumeratingAll(self):
 		"""Append to self.gs a template_graph for every non-zero bit pattern accepted by createG."""
 		binary_value = self.genBinValue()
 		i = 1  # pattern 0 has no edges and is skipped
 		while i < binary_value:
 			g = self.createG(i)
 			if g:
 				tg = template_graph(i)
 				tg.updateG(g)
 				self.gs.append(tg)
 			i += 1
 	def genBinValue(self):
 		"""Set bit_len to size*size and return the number of bit patterns, 2**bit_len."""
 		n = self.size
 		self.bit_len = n*n
 		return 2**(self.bit_len)
 	def createG(self, i):
 		"""Return the directed graph encoded by pattern i if it spans all `size` nodes and is connected when undirected, else False.
 		Raises ValueError when the pattern does not fit into size*size bits.
 		"""
 		g = nx.DiGraph()
 		bits = self.genArray(i)
 		size = self.size
 		if len(bits) != size * size:
 			raise ValueError('pattern %r does not fit into %d bits' % (i, size * size))
 		# Indexes the flat bit list directly: the numpy reshape used before
 		# relied on `np`, whose import is commented out in this module.
 		for row in range(size):
 			for col in range(size):
 				if bits[row * size + col] == 1:
 					g.add_edge(row, col)
 		u_g = g.to_undirected()
 		if len(g) == size and nx.is_connected(u_g):
 			return g
 		return False
 	def genArray(self, i):
 		"""Return the binary digits of i as a list of ints, left-padded with zeros to bit_len entries."""
 		if self.bit_len is None:
 			# Allows calling this (and createG) before genBinValue.
 			self.genBinValue()
 		l = [int(x) for x in bin(i)[2:]]
 		return [0] * (self.bit_len - len(l)) + l
--- a/search-engine/db.py
+++ b/search-engine/db.py
@ -0,0 +1,356 @@
 import cPickle as pickle 
 from search import *
 from nearpy import Engine
 from nearpy.hashes import RandomDiscretizedProjections
 from nearpy.filters import NearestFilter, UniqueFilter
 from nearpy.distances import EuclideanDistance
 from nearpy.distances import CosineDistance
 from nearpy.hashes import RandomBinaryProjections
 from nearpy.experiments import DistanceRatioExperiment
 from redis import Redis
 from nearpy.storage import RedisStorage
 from feature import *
 import numpy as np
 import os
 import pdb
 import argparse
 import time
 import numpy as np
 from refactoring import *
 import pymongo
 from pymongo import MongoClient
 def initDB():
 	"""Connect to the MongoDB server on localhost:27017.
 	Returns:
 		The ``iot-encoding`` database handle of that server.
 	"""
 	# One client is enough: every extra MongoClient would only be created
 	# and thrown away again.
 	client = MongoClient('mongodb://localhost:27017/')
 	return client['iot-encoding']
 # Module-level MongoDB handles, created as a side effect of importing this file.
 db = initDB()
 # NOTE(review): the name ``db`` is rebound by the ``class db`` statement that
 # follows, so only ``posts`` keeps pointing at the database afterwards.
 posts = db.posts
 class db:
 	"""Feature-vector index built on a nearpy LSH engine.
 	Attributes:
 		feature_list: mapping pickled by dump()/loadDB() and searched by findF().
 		engine: the nearpy Engine, or None until loadHashmap()/loadDB() ran.
 	"""
 	def __init__(self):
 		self.feature_list = {}
 		self.engine = None
 	def loadHashmap(self, feature_size, result_n):
 		"""Create ``self.engine`` with buckets stored in the local Redis server.
 		The hash configuration named 'test' is reused when it can be loaded
 		from Redis; otherwise a fresh hash is created. The configuration in
 		use is written back to Redis before returning.
 		NOTE(review): ``feature_size`` and ``result_n`` are accepted but not
 		used -- the engine dimension is fixed at 192 and the nearest filter
 		keeps 1000 results. Confirm whether they should be wired in.
 		"""
 		redis_object = Redis(host='localhost', port=6379, db=0)
 		redis_storage = RedisStorage(redis_object)
 		try:
 			# Reuse the stored configuration: build an unconfigured hash
 			# and apply what Redis returned.
 			config = redis_storage.load_hash_configuration('test')
 			lshash = RandomBinaryProjections(None, None)
 			lshash.apply_config(config)
 		except Exception:
 			# No usable stored configuration: create the hash from scratch.
 			# NOTE(review): the projection count passed here is 0 -- confirm
 			# that this is intended.
 			lshash = RandomBinaryProjections('test', 0)
 		nearest = NearestFilter(1000)
 		self.engine = Engine(192, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance())
 		# Persist the hash configuration for later runs.
 		redis_storage.store_hash_configuration(lshash)
 	def appendToDB(self, binary_name, funcname, fvector, firmware_name=""):
 		"""Store one vector labelled ``firmware_name.binary_name.funcname``.
 		Does nothing when ``fvector`` is None.
 		"""
 		if fvector is None:
 			return
 		label = ".".join((firmware_name, binary_name, funcname))
 		self.engine.store_vector(np.asarray(fvector), label)
 	def batch_appendDB(self, binary_name, features, firmware_name=""):
 		"""Store every ``funcname -> vector`` entry of ``features``."""
 		for funcname in features:
 			self.appendToDB(binary_name, funcname, features[funcname], firmware_name)
 	def batch_appendDBbyDir(self, base_dir):
 		"""Store every record of firmware 'ddwrt-r21676_result' found in ``posts``.
 		Prints a running record counter.
 		NOTE(review): ``base_dir`` is not used; the records come from the
 		module-level MongoDB collection.
 		"""
 		cursor = posts.find({"firmware_name":"ddwrt-r21676_result"})
 		for i, v in enumerate(cursor):
 			print(i)
 			binary_name = v['binary_name']
 			funcname = v['func_name']
 			firmware_name = v['firmware_name']
 			feature = v['fvector']
 			self.appendToDB(binary_name, funcname, feature, firmware_name)
 	def batch_appendDBbyDir1(self, base_dir):
 		"""Store every function of every ``*.features`` pickle under ``<base_dir>/image/<firmware>/``.
 		Prints each firmware name and, at the end, the number of feature
 		files and functions processed.
 		"""
 		image_dir = os.path.join(base_dir, "image")
 		bnum = 0
 		fnum = 0
 		for firmware_name in os.listdir(image_dir):
 			print(firmware_name)
 			firmware_dir = os.path.join(image_dir, firmware_name)
 			for binary_name in os.listdir(firmware_dir):
 				if not binary_name.endswith(".features"):
 					continue
 				bnum += 1
 				featrues_dir = os.path.join(firmware_dir, binary_name)
 				# NOTE: pickle executes code while loading; only feed it trusted files.
 				with open(featrues_dir, "r") as fh:
 					featrues = pickle.load(fh)
 				for funcname in featrues:
 					fnum += 1
 					self.appendToDB(binary_name, funcname, featrues[funcname], firmware_name)
 				# Release the loaded file before reading the next one.
 				del featrues
 		print("bnum ", bnum)
 		print("fnum ", fnum)
 	def dump(self, base_dir):
 		"""Pickle ``feature_list`` and ``engine`` under ``<base_dir>/data/db/``."""
 		# The open modes are kept as they were so that files written here
 		# stay readable by loadDB().
 		db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
 		with open(db_dir, 'w') as fh:
 			pickle.dump(self.feature_list, fh)
 		db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
 		with open(db_dir, 'w') as fh:
 			pickle.dump(self.engine, fh)
 	def loadDB(self, base_dir):
 		"""Restore ``feature_list`` and ``engine`` from the files written by dump()."""
 		# NOTE: pickle executes code while loading; only feed it trusted files.
 		db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
 		with open(db_dir, 'r') as fh:
 			self.feature_list = pickle.load(fh)
 		db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
 		with open(db_dir, 'r') as fh:
 			self.engine = pickle.load(fh)
 	def findF(self, binary_name, funcname):
 		"""Return the first key of ``feature_list`` whose entry holds ``binary_name`` and, inside it, ``funcname``.
 		Raises:
 			IndexError: when no entry matches.
 		"""
 		x = [v for v in self.feature_list if binary_name in self.feature_list[v] and funcname in self.feature_list[v][binary_name]]
 		return x[0]
 def retrieveFeaturesByDir(n, base_dir):
 	"""Load the feature pickles below the first ``*.features`` entry of ``base_dir``.
 	For that entry (treated as a directory) every listed name ``b`` is
 	expected to have a pickle ``<b>_cb<n>.features`` next to it.
 	Args:
 		n: number used in the ``_cb<n>`` part of the file names.
 		base_dir: directory to scan.
 	Returns:
 		dict mapping the entry name to a dict of ``binary_name -> loaded pickle``.
 		Empty when ``base_dir`` has no ``*.features`` entry.
 	"""
 	firmware_featrues = {}
 	for firmware_name in os.listdir(base_dir):
 		if not firmware_name.endswith(".features"):
 			continue
 		firmware_dir = os.path.join(base_dir, firmware_name)
 		per_binary = {}
 		for binary_name in os.listdir(firmware_dir):
 			featrues_dir = os.path.join(firmware_dir, binary_name + "_cb" + str(n) + ".features")
 			# NOTE: pickle executes code while loading; only feed it trusted files.
 			with open(featrues_dir, "r") as fh:
 				per_binary[binary_name] = pickle.load(fh)
 		firmware_featrues[firmware_name] = per_binary
 		# Only the first matching entry is processed.
 		break
 	return firmware_featrues
 def retrieveFeatures(n, base_dir, filename, funcs):
 	"""Load ``<base_dir>/5000/<filename>_cb<n>.features`` as numpy arrays.
 	Args:
 		n: number used in the ``_cb<n>`` part of the file name.
 		base_dir: directory that contains the ``5000`` sub-directory.
 		filename: base name of the feature file.
 		funcs: currently unused; every function in the file is returned.
 	Returns:
 		dict mapping each function name to ``np.asarray`` of its feature vector.
 	"""
 	featrues_dir = os.path.join(base_dir, "5000", filename + "_cb" + str(n) + ".features")
 	# NOTE: pickle executes code while loading; only feed it trusted files.
 	with open(featrues_dir, "r") as fh:
 		featrues = pickle.load(fh)
 	return {name: np.asarray(featrues[name]) for name in featrues}
 def retrieveVuldb(base_input_dir):
 	"""Unpickle and return the content of the file ``<base_input_dir>/vul``."""
 	vul_path = os.path.join(base_input_dir, "vul")
 	# NOTE: pickle executes code while loading; only feed it trusted files.
 	with open(vul_path, "r") as fh:
 		return pickle.load(fh)
 def retrieveFeaturesx(filename):
 	"""Unpickle and return ``./data/<filename>.features`` (relative to the working directory)."""
 	ida_input_dir = os.path.join("./data/", filename + ".features")
 	# NOTE: pickle executes code while loading; only feed it trusted files.
 	with open(ida_input_dir, "r") as fh:
 		return pickle.load(fh)
 def retrieveQueries(n, base_dir, filename1, featrues_src):
 	"""Load ``<base_dir>/5000/<filename1>_cb<n>.features`` as query vectors.
 	Args:
 		n: number used in the ``_cb<n>`` part of the file name.
 		base_dir: directory that contains the ``5000`` sub-directory.
 		filename1: base name of the feature file.
 		featrues_src: currently unused; every function in the file is returned.
 	Returns:
 		dict mapping each function name to ``np.asarray`` of its feature vector.
 	"""
 	featrues_dir = os.path.join(base_dir, "5000", filename1 + "_cb" + str(n) + ".features")
 	# NOTE: pickle executes code while loading; only feed it trusted files.
 	with open(featrues_dir, "r") as fh:
 		featrues = pickle.load(fh)
 	return {name: np.asarray(featrues[name]) for name in featrues}
 def retrieveQueriesbyDir(n, base_dir, firmware_name, filename1):
 	"""Load ``<base_dir>/<firmware_name>/<filename1>_cb<n>.features`` as query vectors.
 	Returns:
 		dict mapping each function name to ``np.asarray`` of its feature vector.
 	"""
 	featrues_dir = os.path.join(base_dir, firmware_name, filename1 + "_cb" + str(n) + ".features")
 	# NOTE: pickle executes code while loading; only feed it trusted files.
 	with open(featrues_dir, "r") as fh:
 		featrues = pickle.load(fh)
 	return {name: np.asarray(featrues[name]) for name in featrues}
 def retrieveQuery(n, base_dir, filename, funcname):
 	"""Return the feature vector of the first function whose name contains ``funcname``.
 	The vector is read from ``<base_dir>/<filename>_cb<n>.features``.
 	Returns:
 		``np.asarray`` of the matching feature vector.
 	Raises:
 		IndexError: when no function name in the file contains ``funcname``.
 	"""
 	featrues_dir = os.path.join(base_dir, filename + "_cb" + str(n) + ".features")
 	# NOTE: pickle executes code while loading; only feed it trusted files.
 	with open(featrues_dir, "r") as fh:
 		featrues = pickle.load(fh)
 	for name in featrues:
 		# Substring match: the first name containing funcname wins.
 		if funcname in name:
 			return np.asarray(featrues[name])
 	raise IndexError("no function matching %r in %s" % (funcname, featrues_dir))
 def parse_command(argv=None):
 	"""Parse the command-line options of this script.
 	Args:
 		argv: list of argument strings; None (the default) makes argparse
 			read ``sys.argv[1:]``.
 	Returns:
 		argparse.Namespace with ``base_input_dir``, ``output_dir``,
 		``filename1``, ``filename2`` and ``size`` (each None when not given).
 	"""
 	parser = argparse.ArgumentParser(description='Index the features of one binary and query them with the features of another.')
 	parser.add_argument("--base_input_dir", type=str, help="raw binaries to process for training")
 	parser.add_argument('--output_dir', type=str, help="output dir")
 	parser.add_argument("--filename1", type=str, help="name of the binary whose features are indexed")
 	parser.add_argument("--filename2", type=str, help="name of the binary whose features are used as queries")
 	parser.add_argument("--size", type=int, help="the size of each graphlet")
 	return parser.parse_args(argv)
 def loadFuncs(path):
 	"""Read the function names listed, one per line, in ``<path>/func_candid``.
 	Returns:
 		dict mapping every line (without its newline) to 1.
 	"""
 	x86_dir = os.path.join(path, "func_candid")
 	funcs = {}
 	with open(x86_dir, "r") as fp:
 		for line in fp:
 			funcs[line.rstrip("\n")] = 1
 	return funcs
 def _dump_rows(fp, arch, table):
 	"""Write one tab-separated line per entry of ``table``: arch, name, then each value with 3 decimals."""
 	for name in table:
 		cells = ["%.3f" % value for value in table[name]]
 		fp.write("\t".join(["%s\t%s" % (arch, name)] + cells) + "\n")
 def dump(path, featrues, queries):
 	"""Write both feature sets to the text file ``<path>/matrix``.
 	Rows of ``featrues`` are labelled "x86" and rows of ``queries`` are
 	labelled "mips". Vectors may have any length.
 	Args:
 		path: directory in which the ``matrix`` file is (re)written.
 		featrues: mapping of function name to a sequence of numbers.
 		queries: mapping of function name to a sequence of numbers.
 	"""
 	with open(path + "/" + "matrix", 'w') as fp:
 		_dump_rows(fp, "x86", featrues)
 		_dump_rows(fp, "mips", queries)
 def queryBytwo(base_input_dir, filename1, filename2, n):
 	"""Index the features of ``filename1`` and measure how well ``filename2`` retrieves them.
 	Every function of ``filename2`` that also exists by name in ``filename1``
 	is used as a query. For each threshold in 1, 11, ..., 201 the function
 	prints the fraction of those queries whose own name shows up in a
 	neighbour label at a rank not greater than the threshold (0.0 when there
 	is no such query).
 	Args:
 		base_input_dir: directory holding ``func_candid``, ``vul`` and the ``5000`` feature files.
 		filename1: base name of the feature file that is indexed.
 		filename2: base name of the feature file that supplies the queries.
 		n: number used in the ``_cb<n>`` part of the feature file names.
 	"""
 	db_instance = db()
 	funcs = loadFuncs(base_input_dir)
 	db_instance.loadHashmap(n, 50000)
 	featrues = retrieveFeatures(n, base_input_dir, filename1, funcs)
 	queries = retrieveQueries(n, base_input_dir, filename2, funcs)
 	# Loaded as before, although the result is not used by the evaluation.
 	vul_db = retrieveVuldb(base_input_dir)
 	db_instance.batch_appendDB(filename1, featrues)
 	# The rank of a query does not depend on the threshold, so the
 	# neighbour search runs once per query instead of once per threshold.
 	ranks = []
 	for name in queries:
 		if name not in featrues:
 			continue
 		neighbours = db_instance.engine.neighbours(queries[name])
 		# Position of the first neighbour whose label contains the query
 		# name; None when no neighbour matches.
 		rank = next((pos for pos, item in enumerate(neighbours) if name in item[1]), None)
 		ranks.append(rank)
 	for threthold in xrange(1, 210, 10):
 		hits = sum(1 for rank in ranks if rank is not None and rank <= threthold)
 		acc = hits * 1.0 / len(ranks) if ranks else 0.0
 		print(acc)
 def queryAll(base_dir, firmware_name, filename1, n):
 	"""Index the stored firmware records and query them with one binary's features.
 	Prints the time spent indexing, a running counter per query and finally
 	the fraction of queries whose own name shows up in a neighbour label at
 	rank 155 or better (0.0 when there are no queries).
 	Args:
 		base_dir: directory that holds ``<firmware_name>/<filename1>_cb<n>.features``.
 		firmware_name: sub-directory of ``base_dir`` with the query file.
 		filename1: base name of the query feature file.
 		n: number used in the ``_cb<n>`` part of the file name.
 	"""
 	threthold = 155
 	db_instance = db()
 	db_instance.loadHashmap(n, 50000)
 	queries = retrieveQueriesbyDir(n, base_dir, firmware_name, filename1)
 	start = time.time()
 	# batch_appendDBbyDir takes the base directory only.
 	db_instance.batch_appendDBbyDir(base_dir)
 	dur = time.time() - start
 	print(dur)
 	hit = []
 	for i, name in enumerate(queries):
 		print(i)
 		neighbours = db_instance.engine.neighbours(queries[name])
 		# Position of the first neighbour whose label contains the query
 		# name; None when no neighbour matches.
 		rank = next((pos for pos, item in enumerate(neighbours) if name in item[1]), None)
 		if rank is not None and rank <= threthold:
 			hit.append(1)
 		else:
 			hit.append(0)
 	acc = sum(hit) * 1.0 / len(hit) if hit else 0.0
 	print(acc)
 if __name__ == "__main__":
 	# Script entry point: read the options and run the two-binary evaluation.
 	args = parse_command()
 	base_dir = args.base_input_dir
 	filename1 = args.filename1
 	filename2 = args.filename2
 	n = args.size
 	queryBytwo(base_dir, filename1, filename2, n)