This commit is contained in:
TinyCaviar 2023-08-03 10:03:02 +08:00
parent 8dad81779b
commit badd4eada6
40 changed files with 17037 additions and 11623 deletions

8
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/../../../../../:\hkn\project_folder\Gencoding3\.idea/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/

17
.idea/Gencoding3.iml Normal file
View File

@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/Genius3/python" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Python 2.7 (Gencoding)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="pytest" />
</component>
</module>

View File

@ -0,0 +1,24 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyChainedComparisonsInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoreConstantInTheMiddle" value="true" />
</inspection_tool>
<inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="E501" />
</list>
</option>
</inspection_tool>
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="N806" />
<option value="N802" />
<option value="N803" />
</list>
</option>
</inspection_tool>
</profile>
</component>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
.idea/misc.xml Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7 (Gencoding)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/Gencoding3.iml" filepath="$PROJECT_DIR$/.idea/Gencoding3.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

View File

@ -0,0 +1,623 @@
{
"function_edges": [
[
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1
],
[
0,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26
]
], // FCGindexindex
"acfg_list": [ // data.raw_graph_list
{ // CFG data.raw_graph_list[a]
"block_number": 3, // CFG data.raw_graph_list[a].g.__len__()
"block_edges": [
[
0,
0,
1,
1
],
[
0,
2,
0,
2
]
], // 0 data.raw_graph_list[a].g.edges
"block_features": [ //
[
0,
2,
1,
0,
7,
0,
1,
1,
4,
0,
0
], // 11//////////
[
0,
2,
0,
0,
3,
1,
0,
1,
0,
0,
0
],
[
1,
0,
0,
0,
1,
0,
0,
0,
0,
1,
0
]
]
},
{
"block_number": 29, // CFG
"block_edges": [
[
0,
1,
1,
2,
2,
3,
3,
4,
5,
6,
6,
7,
7,
8,
8,
9,
9,
10,
10,
11,
12,
12,
13,
14,
14,
15,
16,
17,
18,
19,
19,
20,
20,
21,
21,
23,
24,
24,
26,
26,
27,
28
],
[
16,
0,
2,
0,
4,
1,
3,
3,
3,
25,
15,
8,
6,
6,
7,
28,
12,
9,
23,
16,
25,
11,
21,
17,
13,
19,
22,
14,
19,
18,
27,
24,
23,
26,
21,
22,
25,
10,
25,
5,
14,
8
]
],
"block_features": [
[
8,
2,
1,
5,
36,
0,
6,
0,
2,
0,
0
],
[
0,
7,
0,
0,
3,
0,
1,
1,
1,
0,
0
],
[
0,
7,
0,
0,
2,
0,
1,
1,
0,
0,
0
],
[
0,
7,
0,
1,
8,
1,
2,
0,
0,
0,
0
],
[
0,
7,
1,
0,
2,
0,
1,
0,
0,
0,
0
],
[
0,
7,
0,
0,
1,
0,
0,
0,
1,
0,
0
],
[
1,
18,
0,
1,
9,
0,
2,
1,
1,
0,
0
],
[
1,
21,
1,
0,
3,
0,
1,
1,
0,
0,
0
],
[
0,
21,
0,
1,
4,
1,
2,
0,
0,
0,
0
],
[
0,
24,
0,
2,
12,
1,
3,
0,
0,
0,
0
],
[
1,
26,
0,
3,
16,
0,
4,
1,
4,
0,
0
],
[
1,
2,
0,
5,
22,
0,
5,
0,
1,
0,
0
],
[
5,
4,
1,
3,
21,
0,
4,
1,
3,
0,
0
],
[
4,
11,
0,
2,
17,
1,
2,
0,
1,
0,
0
],
[
2,
14,
0,
1,
12,
0,
2,
1,
1,
0,
0
],
[
3,
17,
0,
0,
10,
0,
1,
0,
1,
0,
0
],
[
1,
1,
0,
1,
5,
0,
2,
0,
0,
0,
0
],
[
0,
14,
0,
0,
1,
0,
0,
0,
0,
0,
0
],
[
3,
17,
0,
0,
7,
0,
0,
0,
0,
0,
0
],
[
0,
17,
0,
1,
5,
0,
2,
1,
1,
0,
0
],
[
2,
28,
1,
1,
11,
1,
2,
1,
1,
0,
0
],
[
0,
11,
0,
1,
8,
1,
2,
0,
0,
0,
0
],
[
0,
0,
0,
1,
1,
0,
1,
0,
0,
0,
0
],
[
1,
1,
0,
0,
1,
0,
0,
0,
0,
0,
0
],
[
12,
27,
1,
7,
41,
0,
8,
1,
6,
0,
0
],
[
0,
0,
1,
0,
7,
1,
0,
0,
0,
1,
0
],
[
2,
9,
0,
2,
17,
0,
3,
1,
3,
0,
0
],
[
2,
14,
0,
0,
5,
0,
1,
0,
4,
0,
0
],
[
1,
21,
4,
1,
13,
0,
2,
0,
5,
0,
0
]
]
}
],
"function_names": [ //
"sub_401000",
"start",
"GetTempPathW",
"GetFileSize",
"GetCurrentDirectoryW",
"DeleteFileW",
"CloseHandle",
"WriteFile",
"lstrcmpW",
"ReadFile",
"GetModuleHandleW",
"ExitProcess",
"HeapCreate",
"HeapAlloc",
"GetModuleFileNameW",
"CreateFileW",
"lstrlenW",
"ShellExecuteW",
"wsprintfW",
"HttpSendRequestW",
"InternetSetOptionW",
"InternetQueryOptionW",
"HttpOpenRequestW",
"HttpQueryInfoW",
"InternetReadFile",
"InternetConnectW",
"InternetOpenW"
], //
"hash": "316ebb797d5196020eee013cfe771671fff4da8859adc9f385f52a74e82f4e55", // md5
"function_number": 27 //
}

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -374,8 +374,8 @@ def DecodePreviousInstruction(ea):
@param ea: address to decode @param ea: address to decode
@return: None or a new insn_t instance @return: None or a new insn_t instance
""" """
prev_addr = idaapi.decode_prev_insn(ea) inslen = idaapi.decode_prev_insn(ea)
if prev_addr == idaapi.BADADDR: if inslen == 0:
return None return None
return idaapi.cmd.copy() return idaapi.cmd.copy()
@ -462,8 +462,7 @@ def GetInputFileMD5():
class Strings(object): class Strings(object):
""" """
Allows iterating over the string list. The set of strings will not be modified. Returns the string list.
, unless asked explicitly at setup()-time..
Example: Example:
s = Strings() s = Strings()
@ -484,34 +483,8 @@ class Strings(object):
self.length = si.length self.length = si.length
"""string length""" """string length"""
def is_1_byte_encoding(self):
return not self.is_2_bytes_encoding() and not self.is_4_bytes_encoding()
def is_2_bytes_encoding(self):
return (self.type & 7) in [idaapi.ASCSTR_UTF16, idaapi.ASCSTR_ULEN2, idaapi.ASCSTR_ULEN4]
def is_4_bytes_encoding(self):
return (self.type & 7) == idaapi.ASCSTR_UTF32
def _toseq(self, as_unicode):
if self.is_2_bytes_encoding():
conv = idaapi.ACFOPT_UTF16
pyenc = "utf-16"
elif self.is_4_bytes_encoding():
conv = idaapi.ACFOPT_UTF8
pyenc = "utf-8"
else:
conv = idaapi.ACFOPT_ASCII
pyenc = 'ascii'
strbytes = idaapi.get_ascii_contents2(self.ea, self.length, self.type, conv)
return unicode(strbytes, pyenc, 'replace') if as_unicode else strbytes
def __str__(self): def __str__(self):
return self._toseq(False) return idc.GetString(self.ea, self.length, self.type)
def __unicode__(self):
return self._toseq(True)
STR_C = 0x0001 STR_C = 0x0001
"""C-style ASCII string""" """C-style ASCII string"""
@ -532,7 +505,8 @@ class Strings(object):
"""Clears the strings list cache""" """Clears the strings list cache"""
self.refresh(0, 0) # when ea1=ea2 the kernel will clear the cache self.refresh(0, 0) # when ea1=ea2 the kernel will clear the cache
def __init__(self, default_setup = False):
def __init__(self, default_setup = True):
""" """
Initializes the Strings enumeration helper class Initializes the Strings enumeration helper class
@ -541,11 +515,10 @@ class Strings(object):
self.size = 0 self.size = 0
if default_setup: if default_setup:
self.setup() self.setup()
else:
self.refresh()
self._si = idaapi.string_info_t() self._si = idaapi.string_info_t()
def refresh(self, ea1=None, ea2=None): def refresh(self, ea1=None, ea2=None):
"""Refreshes the strings list""" """Refreshes the strings list"""
if ea1 is None: if ea1 is None:

Binary file not shown.

View File

@ -1450,11 +1450,7 @@ def PatchByte(ea, value):
@param ea: linear address @param ea: linear address
@param value: new value of the byte @param value: new value of the byte
@return: 1 if the database has been modified, @return: 1 if successful, 0 if not
0 if either the debugger is running and the process' memory
has value 'value' at address 'ea',
or the debugger is not running, and the IDB
has value 'value' at address 'ea already.
""" """
return idaapi.patch_byte(ea, value) return idaapi.patch_byte(ea, value)
@ -1466,11 +1462,7 @@ def PatchWord(ea, value):
@param ea: linear address @param ea: linear address
@param value: new value of the word @param value: new value of the word
@return: 1 if the database has been modified, @return: 1 if successful, 0 if not
0 if either the debugger is running and the process' memory
has value 'value' at address 'ea',
or the debugger is not running, and the IDB
has value 'value' at address 'ea already.
""" """
return idaapi.patch_word(ea, value) return idaapi.patch_word(ea, value)
@ -1482,31 +1474,11 @@ def PatchDword(ea, value):
@param ea: linear address @param ea: linear address
@param value: new value of the double word @param value: new value of the double word
@return: 1 if the database has been modified, @return: 1 if successful, 0 if not
0 if either the debugger is running and the process' memory
has value 'value' at address 'ea',
or the debugger is not running, and the IDB
has value 'value' at address 'ea already.
""" """
return idaapi.patch_long(ea, value) return idaapi.patch_long(ea, value)
def PatchQword(ea, value):
"""
Change value of a quad word
@param ea: linear address
@param value: new value of the quad word
@return: 1 if the database has been modified,
0 if either the debugger is running and the process' memory
has value 'value' at address 'ea',
or the debugger is not running, and the IDB
has value 'value' at address 'ea already.
"""
return idaapi.patch_qword(ea, value)
def SetFlags(ea, flags): def SetFlags(ea, flags):
""" """
Set new value of flags Set new value of flags
@ -2316,13 +2288,13 @@ o_displ = idaapi.o_displ # Memory Reg [Base Reg + Index Reg + Displacemen
o_imm = idaapi.o_imm # Immediate Value value o_imm = idaapi.o_imm # Immediate Value value
o_far = idaapi.o_far # Immediate Far Address (CODE) addr o_far = idaapi.o_far # Immediate Far Address (CODE) addr
o_near = idaapi.o_near # Immediate Near Address (CODE) addr o_near = idaapi.o_near # Immediate Near Address (CODE) addr
o_idpspec0 = idaapi.o_idpspec0 # Processor specific type o_idpspec0 = idaapi.o_idpspec0 # IDP specific type
o_idpspec1 = idaapi.o_idpspec1 # Processor specific type o_idpspec1 = idaapi.o_idpspec1 # IDP specific type
o_idpspec2 = idaapi.o_idpspec2 # Processor specific type o_idpspec2 = idaapi.o_idpspec2 # IDP specific type
o_idpspec3 = idaapi.o_idpspec3 # Processor specific type o_idpspec3 = idaapi.o_idpspec3 # IDP specific type
o_idpspec4 = idaapi.o_idpspec4 # Processor specific type o_idpspec4 = idaapi.o_idpspec4 # IDP specific type
o_idpspec5 = idaapi.o_idpspec5 # Processor specific type o_idpspec5 = idaapi.o_idpspec5 # IDP specific type
# There can be more processor specific types o_last = idaapi.o_last # first unused type
# x86 # x86
o_trreg = idaapi.o_idpspec0 # trace register o_trreg = idaapi.o_idpspec0 # trace register
@ -2336,7 +2308,7 @@ o_xmmreg = idaapi.o_idpspec5 # xmm register
o_reglist = idaapi.o_idpspec1 # Register list (for LDM/STM) o_reglist = idaapi.o_idpspec1 # Register list (for LDM/STM)
o_creglist = idaapi.o_idpspec2 # Coprocessor register list (for CDP) o_creglist = idaapi.o_idpspec2 # Coprocessor register list (for CDP)
o_creg = idaapi.o_idpspec3 # Coprocessor register (for LDC/STC) o_creg = idaapi.o_idpspec3 # Coprocessor register (for LDC/STC)
o_fpreg_arm = idaapi.o_idpspec4 # Floating point register o_fpreg = idaapi.o_idpspec4 # Floating point register
o_fpreglist = idaapi.o_idpspec5 # Floating point register list o_fpreglist = idaapi.o_idpspec5 # Floating point register list
o_text = (idaapi.o_idpspec5+1) # Arbitrary text stored in the operand o_text = (idaapi.o_idpspec5+1) # Arbitrary text stored in the operand
@ -3144,20 +3116,6 @@ def Message(msg):
idaapi.msg(msg) idaapi.msg(msg)
def UMessage(msg):
"""
Display an UTF-8 string in the message window
The result of the stringification of the arguments
will be treated as an UTF-8 string.
@param msg: message to print (formatting is done in Python)
This function can be used to debug IDC scripts
"""
idaapi.umsg(msg)
def Warning(msg): def Warning(msg):
""" """
Display a message in a message box Display a message in a message box
@ -3632,7 +3590,7 @@ def GetSegmentAttr(segea, attr):
seg = idaapi.getseg(segea) seg = idaapi.getseg(segea)
assert seg, "could not find segment at 0x%x" % segea assert seg, "could not find segment at 0x%x" % segea
if attr in [ SEGATTR_ES, SEGATTR_CS, SEGATTR_SS, SEGATTR_DS, SEGATTR_FS, SEGATTR_GS ]: if attr in [ SEGATTR_ES, SEGATTR_CS, SEGATTR_SS, SEGATTR_DS, SEGATTR_FS, SEGATTR_GS ]:
return idaapi.get_defsr(seg, _SEGATTRMAP[attr][1]) return idaapi.get_defsr(seg, _SEGATTRMAP[attr])
else: else:
return _IDC_GetAttr(seg, _SEGATTRMAP, attr) return _IDC_GetAttr(seg, _SEGATTRMAP, attr)
@ -3651,7 +3609,7 @@ def SetSegmentAttr(segea, attr, value):
seg = idaapi.getseg(segea) seg = idaapi.getseg(segea)
assert seg, "could not find segment at 0x%x" % segea assert seg, "could not find segment at 0x%x" % segea
if attr in [ SEGATTR_ES, SEGATTR_CS, SEGATTR_SS, SEGATTR_DS, SEGATTR_FS, SEGATTR_GS ]: if attr in [ SEGATTR_ES, SEGATTR_CS, SEGATTR_SS, SEGATTR_DS, SEGATTR_FS, SEGATTR_GS ]:
idaapi.set_defsr(seg, _SEGATTRMAP[attr][1], value) idaapi.set_defsr(seg, _SEGATTRMAP[attr], value)
else: else:
_IDC_SetAttr(seg, _SEGATTRMAP, attr, value) _IDC_SetAttr(seg, _SEGATTRMAP, attr, value)
return seg.update() return seg.update()
@ -3995,11 +3953,7 @@ def SaveFile(filepath, pos, ea, size):
@return: 0 - error, 1 - ok @return: 0 - error, 1 - ok
""" """
if ( os.path.isfile(filepath) ):
of = idaapi.fopenM(filepath) of = idaapi.fopenM(filepath)
else:
of = idaapi.fopenWB(filepath)
if of: if of:
retval = idaapi.base2file(of, pos, ea, ea+size) retval = idaapi.base2file(of, pos, ea, ea+size)
@ -5588,23 +5542,6 @@ def SetMemberComment(sid, member_offset, comment, repeatable):
return idaapi.set_member_cmt(m, comment, repeatable) return idaapi.set_member_cmt(m, comment, repeatable)
def ExpandStruc(sid, offset, delta, recalc):
"""
Expand or shrink a structure type
@param id: structure type ID
@param offset: offset in the structure
@param delta: how many bytes to add or remove
@param recalc: recalculate the locations where the structure
type is used
@return: != 0 - ok
"""
s = idaapi.get_struc(sid)
if not s:
return 0
return idaapi.expand_struc(s, offset, delta, recalc)
def GetFchunkAttr(ea, attr): def GetFchunkAttr(ea, attr):
""" """
Get a function chunk attribute Get a function chunk attribute
@ -6980,16 +6917,16 @@ def ApplyType(ea, py_type, flags = TINFO_DEFINITE):
@return: Boolean @return: Boolean
""" """
if py_type is None: if py_type != None:
py_type = ""
if isinstance(py_type, basestring) and len(py_type) == 0:
pt = ("", "")
else:
if len(py_type) == 3: if len(py_type) == 3:
pt = py_type[1:] # skip name component pt = py_type[1:] # skip name component
else: else:
pt = py_type pt = py_type
return idaapi.apply_type(idaapi.cvar.idati, pt[0], pt[1], ea, flags) return idaapi.apply_type(idaapi.cvar.idati, pt[0], pt[1], ea, flags)
if idaapi.has_ti(ea):
idaapi.del_tinfo(ea)
return True
return False
def SetType(ea, newtype): def SetType(ea, newtype):
""" """
@ -7004,7 +6941,7 @@ def SetType(ea, newtype):
@return: 1-ok, 0-failed. @return: 1-ok, 0-failed.
""" """
if newtype is not '': if newtype is not '':
pt = ParseType(newtype, 1) # silent pt = ParseType(newtype, 0)
if pt is None: if pt is None:
# parsing failed # parsing failed
return None return None

Binary file not shown.

View File

@ -28,8 +28,9 @@ class IDAPythonStdOut:
Dummy file-like class that receives stout and stderr Dummy file-like class that receives stout and stderr
""" """
def write(self, text): def write(self, text):
# NB: in case 'text' is Unicode, msg() will decode it # Swap out the unprintable characters
# and call umsg() to print it text = text.decode('ascii', 'replace').encode('ascii', 'replace')
# Print to IDA message window
_idaapi.msg(text) _idaapi.msg(text)
def flush(self): def flush(self):

View File

@ -1,9 +1,3 @@
import copy
import networkx as nx
from idautils import *
from idaapi import *
from idc import *
import copy import copy
import networkx as nx import networkx as nx
from idautils import * from idautils import *
@ -99,11 +93,11 @@ def filtering(cfg):
bb_start = bb[0] bb_start = bb[0]
bb_end = bb[1] bb_end = bb[1]
re = remove(bb_start, bb_end) re = remove(bb_start, bb_end)
print bb_id, re, bb_start, bb_end print(bb_id, re, bb_start, bb_end)
if re: if re:
print re, bb_id print(re, bb_id)
rm_sets.append(bb_id) rm_sets.append(bb_id)
print rm_sets print(rm_sets)
for bb_id in rm_sets: for bb_id in rm_sets:
cfg.remove_node(bb_id) cfg.remove_node(bb_id)
@ -160,16 +154,16 @@ def attributingRe(cfg, externs_eas, ea_externs):
def attributing(cfg): def attributing(cfg):
ga = graph_analysis() ga = graph_analysis()
ga.gwithoffspring(cfg) ga.gwithoffspring(cfg)
print "finishing offspring" print("finishing offspring")
for node in cfg: for node in cfg:
stmt_num = getStmtNum(node) stmt_num = getStmtNum(node)
binary_value = getBinaryValue(node) binary_value = getBinaryValue(node)
cfg.node[node]['stmt_num'] = stmt_num cfg.node[node]['stmt_num'] = stmt_num
cfg.node[node]['binary_value'] = binary_value cfg.node[node]['binary_value'] = binary_value
ga.domChecking(cfg) ga.domChecking(cfg)
print "finishing domChecking" print("finishing domChecking")
ga.loopChecking(cfg) ga.loopChecking(cfg)
print "finishing loopChecking" print("finishing loopChecking")
def getStmtNum(node): def getStmtNum(node):
@ -190,17 +184,17 @@ def getBinaryValue(node):
for x in xrange((inst_addr - start)-1): for x in xrange((inst_addr - start)-1):
addr = start + x addr = start + x
y = GetOriginalByte(addr) y = GetOriginalByte(addr)
print value, addr, y print(value, addr, y)
value = value | y value = value | y
value = value << 8 value = value << 8
print value print(value)
addr = inst_addr - 1 addr = inst_addr - 1
y = GetOriginalByte(addr) y = GetOriginalByte(addr)
print value, addr, y print(value, addr, y)
value = value | y value = value | y
print node print(node)
print bin(value) print(bin(value))
return value return value

View File

@ -0,0 +1,60 @@
# coding=utf-8
import pickle as pk
import re
import json
import os
def convert():
# for workflow in range(0, 20):
workflow = 0
cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
output_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow)
dot_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot".format(workflow)
for cfg in os.listdir(cfg_dir):
name = cfg[:-4] # 纯文件名,不带后缀
cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
data = pk.load(cfg_file)
cfg_file.close()
# 打开dot文件获取fcg
raw_function_edges = []
with open(os.path.join(dot_path, name + '.dot'), 'r') as dot:
for line in dot:
if '->' in line:
raw_function_edges.append(re.findall(r'\b\d+\b', line))
# 为当前pe文件创建json对象
json_obj = {
'hash': data.binary_name[11:],
'function_number': data.raw_graph_list.__len__(),
'function_edges': [[d[0] for d in raw_function_edges], [d[1] for d in raw_function_edges]],
'acfg_list': [],
'function_names': []
}
# 读取pkl文件一个acfg由一个函数分解而来
for acfg in data.raw_graph_list:
# 这里2是因为Genius框架提取特征时将后代数量放在2
offspring = [d.get('v')[2] for d in acfg.g.node.values()]
# 将后代数量的特征放入bb_features中
for i, f in enumerate(acfg.bb_features):
f.append(offspring[i])
acfg_item = {
'block_number': acfg.g.__len__(),
'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
'block_features': acfg.bb_features
}
json_obj['acfg_list'].append(acfg_item)
json_obj['function_names'].append(acfg.funcname)
# 将结果写入json本地文件
result = json.dumps(json_obj)
with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
out.write(result)
if __name__ == '__main__':
convert()

View File

@ -1,3 +1,4 @@
# coding=utf-8
# #
# Reference Lister # Reference Lister
# #
@ -6,12 +7,13 @@
# Implemented with the idautils module # Implemented with the idautils module
# #
import networkx as nx import networkx as nx
import cPickle as pickle
import pdb import pdb
from graph_analysis_ida import * from graph_analysis_ida import *
from graph_property import * from graph_property import *
#import wingdbstub
#wingdbstub.Ensure()
# import wingdbstub
# wingdbstub.Ensure()
def get_funcs(ea): def get_funcs(ea):
funcs = {} funcs = {}
@ -28,53 +30,81 @@ def get_funcs(ea):
funcs[funcname].append((start, end)) funcs[funcname].append((start, end))
return funcs return funcs
def get_funcs_for_discoverRe(ea):
features = {} # 似乎是没用的函数
for funcea in Functions(SegStart(ea)): # def get_funcs_for_discoverRe(ea):
funcname = GetFunctionName(funcea) # features = {}
print funcname # for funcea in Functions(SegStart(ea)):
func = get_func(funcea) # funcname = GetFunctionName(funcea)
feature = get_discoverRe_feature(func) # print(funcname)
features[funcname] = feature # func = get_func(funcea)
return features # feature = get_discoverRe_feature(func)
# features[funcname] = feature
# return features
# 获取所有bb的11维属性特征
# 调用/传输/算术/逻辑/比较/移动/终止/数据声明/总指令数/字符串或整数常量/后代的数量
def get_bb_features(func):
    """Return the per-basic-block feature vectors of *func*.

    Each basic block yields a 10-element list of counts, in this order:
    calls, transfer, arithmetic, logic, compare, move, interrupt,
    data-declaration, total instructions, and string-or-integer constants.
    (The caller appends an 11th value, the offspring count, afterwards.)
    """
    # Mnemonic lookup tables for the category counters handled by calIns().
    cmp_table = {'cmp': 1, 'cmps': 1, 'cmpsb': 1, 'cmppd': 1, 'cmpps': 1, 'fcom': 1, 'fcomp': 1, 'fcompp': 1, 'ficom': 1, 'ficomp': 1, 'ptest': 1, 'test': 1}
    mov_table = {'mov': 1, 'movb': 1, 'movw': 1, 'movl': 1, 'movq': 1, 'movabsq': 1, 'push': 1, 'pop': 1, 'lea': 1}
    interrupt_table = {'int1': 1, 'int3': 1, 'into': 1, 'iret': 1, 'iretd': 1, 'iretq': 1}
    declare_table = {'dw': 1, 'dd': 1, 'db': 1}
    features = []
    for block in [(chunk.startEA, chunk.endEA) for chunk in FlowChart(func)]:
        counts = [
            calCalls(block),
            calTransferIns(block),
            calArithmeticIns(block),
            calLogicInstructions(block),
            calIns(block, cmp_table),
            calIns(block, mov_table),
            calIns(block, interrupt_table),
            calIns(block, declare_table),
            calInsts(block),
        ]
        consts = getBBconsts(block)
        counts.append(len(consts[0]) + len(consts[1]))
        features.append(counts)
    return features
def get_discoverRe_feature(func, icfg): def get_discoverRe_feature(func, icfg):
start = func.startEA start = func.startEA
end = func.endEA end = func.endEA
features = [] features = []
FunctionCalls = getFuncCalls(func) FunctionCalls = getFuncCalls(func)
#1 # 1
features.append(FunctionCalls) features.append(FunctionCalls)
LogicInstr = getLogicInsts(func) LogicInstr = getLogicInsts(func)
#2 # 2
features.append(LogicInstr) features.append(LogicInstr)
Transfer = getTransferInsts(func) Transfer = getTransferInsts(func)
#3 # 3
features.append(Transfer) features.append(Transfer)
Locals = getLocalVariables(func) Locals = getLocalVariables(func)
#4 # 4
features.append(Locals) features.append(Locals)
BB = getBasicBlocks(func) BB = getBasicBlocks(func)
#5 # 5
features.append(BB) features.append(BB)
Edges = len(icfg.edges()) Edges = len(icfg.edges())
#6 # 6
features.append(Edges) features.append(Edges)
Incoming = getIncommingCalls(func) Incoming = getIncommingCalls(func)
#7 # 7
features.append(Incoming) features.append(Incoming)
#8 # 8
Instrs = getIntrs(func) Instrs = getIntrs(func)
features.append(Instrs) features.append(Instrs)
between = retrieveGP(icfg) between = retrieveGP(icfg)
#9 # 9
features.append(between) features.append(between)
strings, consts = getfunc_consts(func) strings, consts = getfunc_consts(func)
# 10
features.append(strings) features.append(strings)
# 11
features.append(consts) features.append(consts)
return features return features
def get_func_names(ea): def get_func_names(ea):
funcs = {} funcs = {}
for funcea in Functions(SegStart(ea)): for funcea in Functions(SegStart(ea)):
@ -82,6 +112,7 @@ def get_func_names(ea):
funcs[funcname] = funcea funcs[funcname] = funcea
return funcs return funcs
def get_func_bases(ea): def get_func_bases(ea):
funcs = {} funcs = {}
for funcea in Functions(SegStart(ea)): for funcea in Functions(SegStart(ea)):
@ -89,6 +120,7 @@ def get_func_bases(ea):
funcs[funcea] = funcname funcs[funcea] = funcname
return funcs return funcs
def get_func_range(ea): def get_func_range(ea):
funcs = {} funcs = {}
for funcea in Functions(SegStart(ea)): for funcea in Functions(SegStart(ea)):
@ -97,6 +129,7 @@ def get_func_range(ea):
funcs[funcname] = (func.startEA, func.endEA) funcs[funcname] = (func.startEA, func.endEA)
return funcs return funcs
def get_func_sequences(ea): def get_func_sequences(ea):
funcs_bodylist = {} funcs_bodylist = {}
funcs = get_funcs(ea) funcs = get_funcs(ea)
@ -111,16 +144,17 @@ def get_func_sequences(ea):
inst_addr = NextHead(inst_addr) inst_addr = NextHead(inst_addr)
return funcs_bodylist return funcs_bodylist
def get_func_cfgs(ea): def get_func_cfgs(ea):
func_cfglist = {} func_cfglist = {}
i = 0 i = 0
start, end = get_section('LOAD') start, end = get_section('LOAD')
#print start, end # print start, end
for funcea in Functions(SegStart(ea)): for funcea in Functions(SegStart(ea)):
if start <= funcea <= end: if start <= funcea <= end:
funcname = GetFunctionName(funcea) funcname = GetFunctionName(funcea)
func = get_func(funcea) func = get_func(funcea)
print i print(i)
i += 1 i += 1
try: try:
icfg = cfg.cfg_construct(func) icfg = cfg.cfg_construct(func)
@ -130,6 +164,7 @@ def get_func_cfgs(ea):
return func_cfglist return func_cfglist
def get_section(t): def get_section(t):
base = SegByName(t) base = SegByName(t)
start = SegByBase(base) start = SegByBase(base)
@ -144,7 +179,7 @@ def get_func_cfg_sequences(func_cfglist):
cfg = func_cfglist[funcname][0] cfg = func_cfglist[funcname][0]
for start, end in cfg: for start, end in cfg:
codesq = get_sequences(start, end) codesq = get_sequences(start, end)
func_cfg_seqlist[funcname][(start,end)] = codesq func_cfg_seqlist[funcname][(start, end)] = codesq
return func_cfg_seqlist return func_cfg_seqlist
@ -158,8 +193,9 @@ def get_sequences(start, end):
inst_addr = NextHead(inst_addr) inst_addr = NextHead(inst_addr)
return seq return seq
def get_stack_arg(func_addr): def get_stack_arg(func_addr):
print func_addr print(func_addr)
args = [] args = []
stack = GetFrame(func_addr) stack = GetFrame(func_addr)
if not stack: if not stack:
@ -167,18 +203,19 @@ def get_stack_arg(func_addr):
firstM = GetFirstMember(stack) firstM = GetFirstMember(stack)
lastM = GetLastMember(stack) lastM = GetLastMember(stack)
i = firstM i = firstM
while i <=lastM: while i <= lastM:
mName = GetMemberName(stack,i) mName = GetMemberName(stack, i)
mSize = GetMemberSize(stack,i) mSize = GetMemberSize(stack, i)
if mSize: if mSize:
i = i + mSize i = i + mSize
else: else:
i = i+4 i = i + 4
if mName not in args and mName and ' s' not in mName and ' r' not in mName: if mName not in args and mName and ' s' not in mName and ' r' not in mName:
args.append(mName) args.append(mName)
return args return args
#pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w')) # pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))
def processDataSegs(): def processDataSegs():
funcdata = {} funcdata = {}
@ -195,7 +232,7 @@ def processDataSegs():
refs = [v for v in DataRefsTo(cur)] refs = [v for v in DataRefsTo(cur)]
for fea in refs: for fea in refs:
name = GetFunctionName(fea) name = GetFunctionName(fea)
if len(name)== 0: if len(name) == 0:
continue continue
if name not in funcdata: if name not in funcdata:
funcdata[name] = [cur] funcdata[name] = [cur]
@ -208,6 +245,7 @@ def processDataSegs():
cur = NextHead(cur) cur = NextHead(cur)
return funcdata, datafunc return funcdata, datafunc
def obtainDataRefs(callgraph): def obtainDataRefs(callgraph):
datarefs = {} datarefs = {}
funcdata, datafunc = processDataSegs() funcdata, datafunc = processDataSegs()
@ -218,11 +256,9 @@ def obtainDataRefs(callgraph):
refs = datafunc[dd] refs = datafunc[dd]
refs = list(set(refs)) refs = list(set(refs))
if node in datarefs: if node in datarefs:
print refs print(refs)
datarefs[node] += refs datarefs[node] += refs
datarefs[node] = list(set(datarefs[node])) datarefs[node] = list(set(datarefs[node]))
else: else:
datarefs[node] = refs datarefs[node] = refs
return datarefs return datarefs

View File

@ -11,29 +11,34 @@ from idaapi import *
from idc import * from idc import *
import networkx as nx import networkx as nx
import cfg_constructor as cfg import cfg_constructor as cfg
import cPickle as pickle
import pdb import pdb
from raw_graphs import * from raw_graphs import *
#from discovRe_feature.discovRe import * #from discovRe_feature.discovRe import *
from discovRe import * from discovRe import *
sys.path.append("D:\\hkn\\project_folder\\Gencoding3\\Genius3\\python")
#import wingdbstub #import wingdbstub
#wingdbstub.Ensure() #wingdbstub.Ensure()
def print_obj(obj): def print_obj(obj):
"打印对象的所有属性" # "打印对象的所有属性"
print(obj.__dict__) print(obj.__dict__)
def gt_funcNames(ea): def gt_funcNames(ea):
funcs = [] funcs = []
plt_func, plt_data = processpltSegs() plt_func, plt_data = processpltSegs()
for funcea in Functions(SegStart(ea)): for funcea in Functions(SegStart(ea)):
funcname = get_unified_funcname(funcea) funcname = get_unified_funcname(funcea)
if funcname in plt_func: if funcname in plt_func:
print funcname print(funcname)
continue continue
funcs.append(funcname) funcs.append(funcname)
return funcs return funcs
def get_funcs(ea): def get_funcs(ea):
funcs = {} funcs = {}
# Get current ea # Get current ea
@ -52,6 +57,7 @@ def get_funcs(ea):
funcs[funcname].append((start, end)) funcs[funcname].append((start, end))
return funcs return funcs
# used for the callgraph generation. # used for the callgraph generation.
def get_func_namesWithoutE(ea): def get_func_namesWithoutE(ea):
funcs = {} funcs = {}
@ -59,13 +65,14 @@ def get_func_namesWithoutE(ea):
for funcea in Functions(SegStart(ea)): for funcea in Functions(SegStart(ea)):
funcname = get_unified_funcname(funcea) funcname = get_unified_funcname(funcea)
if 'close' in funcname: if 'close' in funcname:
print funcea print(funcea)
if funcname in plt_func: if funcname in plt_func:
print funcname print(funcname)
continue continue
funcs[funcname] = funcea funcs[funcname] = funcea
return funcs return funcs
# used for the callgraph generation. # used for the callgraph generation.
def get_func_names(ea): def get_func_names(ea):
funcs = {} funcs = {}
@ -74,6 +81,7 @@ def get_func_names(ea):
funcs[funcname] = funcea funcs[funcname] = funcea
return funcs return funcs
def get_func_bases(ea): def get_func_bases(ea):
funcs = {} funcs = {}
plt_func, plt_data = processpltSegs() plt_func, plt_data = processpltSegs()
@ -84,6 +92,7 @@ def get_func_bases(ea):
funcs[funcea] = funcname funcs[funcea] = funcname
return funcs return funcs
def get_func_range(ea): def get_func_range(ea):
funcs = {} funcs = {}
for funcea in Functions(SegStart(ea)): for funcea in Functions(SegStart(ea)):
@ -92,6 +101,7 @@ def get_func_range(ea):
funcs[funcname] = (func.startEA, func.endEA) funcs[funcname] = (func.startEA, func.endEA)
return funcs return funcs
def get_unified_funcname(ea): def get_unified_funcname(ea):
funcname = GetFunctionName(ea) funcname = GetFunctionName(ea)
if len(funcname) > 0: if len(funcname) > 0:
@ -99,6 +109,7 @@ def get_unified_funcname(ea):
funcname = funcname[1:] funcname = funcname[1:]
return funcname return funcname
def get_func_sequences(ea): def get_func_sequences(ea):
funcs_bodylist = {} funcs_bodylist = {}
funcs = get_funcs(ea) funcs = get_funcs(ea)
@ -113,6 +124,7 @@ def get_func_sequences(ea):
inst_addr = NextHead(inst_addr) inst_addr = NextHead(inst_addr)
return funcs_bodylist return funcs_bodylist
def get_func_cfgs_c(ea): def get_func_cfgs_c(ea):
# type: (object) -> object # type: (object) -> object
binary_name = idc.GetInputFile() binary_name = idc.GetInputFile()
@ -122,16 +134,18 @@ def get_func_cfgs_c(ea):
for funcea in Functions(SegStart(ea)): for funcea in Functions(SegStart(ea)):
funcname = get_unified_funcname(funcea) funcname = get_unified_funcname(funcea)
func = get_func(funcea) func = get_func(funcea)
print i print(i)
i += 1 i += 1
icfg = cfg.getCfg(func, externs_eas, ea_externs) icfg = cfg.getCfg(func, externs_eas, ea_externs)
func_f = get_discoverRe_feature(func, icfg[0]) func_f = get_discoverRe_feature(func, icfg[0])
raw_g = raw_graph(funcname, icfg, func_f) #生成一个rawcfg。raw_graph是一个python class定义在 raw_graph.py.包含g本文的ACFG、olg_gdiscovRe的acfg、feature函数级别的一些特征以及betweenness bb_f = get_bb_features(func)
raw_g = raw_graph(funcname, icfg, func_f, bb_f) # todo 为每个bb生成bb_features
raw_cfgs.append(raw_g) # raw_graphs 是另一个python class存储raw_graph的list。定义在 raw_graph.py raw_cfgs.append(raw_g) # raw_graphs 是另一个python class存储raw_graph的list。定义在 raw_graph.py
#print(raw_g.__dict__) #print(raw_g.__dict__)
#print(raw_g) 由于raw_graph、raw_graphs都是class直接print只会打印<raw_graphs.raw_graphs instance at 0x09888FD0>,不能打印对象的属性。 #https://blog.51cto.com/steed/2046408 print_obj、 print(obj.__dict__) #print(raw_g) 由于raw_graph、raw_graphs都是class直接print只会打印<raw_graphs.raw_graphs instance at 0x09888FD0>,不能打印对象的属性。 #https://blog.51cto.com/steed/2046408 print_obj、 print(obj.__dict__)
return raw_cfgs return raw_cfgs
def get_func_cfgs_ctest(ea): def get_func_cfgs_ctest(ea):
binary_name = idc.GetInputFile() binary_name = idc.GetInputFile()
raw_cfgs = raw_graphs(binary_name) raw_cfgs = raw_graphs(binary_name)
@ -141,7 +155,7 @@ def get_func_cfgs_ctest(ea):
for funcea in Functions(SegStart(ea)): for funcea in Functions(SegStart(ea)):
funcname = get_unified_funcname(funcea) funcname = get_unified_funcname(funcea)
func = get_func(funcea) func = get_func(funcea)
print i print(i)
i += 1 i += 1
icfg, old_cfg = cfg.getCfg(func, externs_eas, ea_externs) icfg, old_cfg = cfg.getCfg(func, externs_eas, ea_externs)
diffs[funcname] = (icfg, old_cfg) diffs[funcname] = (icfg, old_cfg)
@ -150,13 +164,14 @@ def get_func_cfgs_ctest(ea):
return diffs return diffs
def get_func_cfgs(ea): def get_func_cfgs(ea):
func_cfglist = {} func_cfglist = {}
i = 0 i = 0
for funcea in Functions(SegStart(ea)): for funcea in Functions(SegStart(ea)):
funcname = get_unified_funcname(funcea) funcname = get_unified_funcname(funcea)
func = get_func(funcea) func = get_func(funcea)
print i print(i)
i += 1 i += 1
try: try:
icfg = cfg.getCfg(func) icfg = cfg.getCfg(func)
@ -166,6 +181,7 @@ def get_func_cfgs(ea):
return func_cfglist return func_cfglist
def get_func_cfg_sequences(func_cfglist): def get_func_cfg_sequences(func_cfglist):
func_cfg_seqlist = {} func_cfg_seqlist = {}
for funcname in func_cfglist: for funcname in func_cfglist:
@ -187,8 +203,9 @@ def get_sequences(start, end):
inst_addr = NextHead(inst_addr) inst_addr = NextHead(inst_addr)
return seq return seq
def get_stack_arg(func_addr): def get_stack_arg(func_addr):
print func_addr print(func_addr)
args = [] args = []
stack = GetFrame(func_addr) stack = GetFrame(func_addr)
if not stack: if not stack:
@ -206,9 +223,9 @@ def get_stack_arg(func_addr):
if mName not in args and mName and ' s' not in mName and ' r' not in mName: if mName not in args and mName and ' s' not in mName and ' r' not in mName:
args.append(mName) args.append(mName)
return args return args
#pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w')) #pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))
def processExternalSegs(): def processExternalSegs():
funcdata = {} funcdata = {}
datafunc = {} datafunc = {}
@ -226,6 +243,7 @@ def processExternalSegs():
cur = NextHead(cur) cur = NextHead(cur)
return funcdata return funcdata
def processpltSegs(): def processpltSegs():
funcdata = {} funcdata = {}
datafunc = {} datafunc = {}
@ -273,6 +291,7 @@ def processDataSegs():
cur = NextHead(cur) cur = NextHead(cur)
return funcdata, datafunc return funcdata, datafunc
def obtainDataRefs(callgraph): def obtainDataRefs(callgraph):
datarefs = {} datarefs = {}
funcdata, datafunc = processDataSegs() funcdata, datafunc = processDataSegs()
@ -283,7 +302,7 @@ def obtainDataRefs(callgraph):
refs = datafunc[dd] refs = datafunc[dd]
refs = list(set(refs)) refs = list(set(refs))
if node in datarefs: if node in datarefs:
print refs print(refs)
datarefs[node] += refs datarefs[node] += refs
datarefs[node] = list(set(datarefs[node])) datarefs[node] = list(set(datarefs[node]))
else: else:

View File

@ -1,3 +1,4 @@
# coding=utf-8
from idautils import * from idautils import *
from idaapi import * from idaapi import *
from idc import * from idc import *
@ -138,7 +139,7 @@ def get_stackVariables(func_addr):
return len(args) return len(args)
# 计算算数指令数量
def calArithmeticIns(bl): def calArithmeticIns(bl):
x86_AI = {'add':1, 'sub':1, 'div':1, 'imul':1, 'idiv':1, 'mul':1, 'shl':1, 'dec':1, 'inc':1} x86_AI = {'add':1, 'sub':1, 'div':1, 'imul':1, 'idiv':1, 'mul':1, 'shl':1, 'dec':1, 'inc':1}
mips_AI = {'add':1, 'addu':1, 'addi':1, 'addiu':1, 'mult':1, 'multu':1, 'div':1, 'divu':1} mips_AI = {'add':1, 'addu':1, 'addi':1, 'addiu':1, 'mult':1, 'multu':1, 'div':1, 'divu':1}
@ -156,6 +157,7 @@ def calArithmeticIns(bl):
inst_addr = NextHead(inst_addr) inst_addr = NextHead(inst_addr)
return invoke_num return invoke_num
# 计算调用数量
def calCalls(bl): def calCalls(bl):
calls = {'call':1, 'jal':1, 'jalr':1} calls = {'call':1, 'jal':1, 'jalr':1}
start = bl[0] start = bl[0]
@ -169,6 +171,7 @@ def calCalls(bl):
inst_addr = NextHead(inst_addr) inst_addr = NextHead(inst_addr)
return invoke_num return invoke_num
# 计算指令数量
def calInsts(bl): def calInsts(bl):
start = bl[0] start = bl[0]
end = bl[1] end = bl[1]
@ -196,7 +199,23 @@ def calLogicInstructions(bl):
inst_addr = NextHead(inst_addr) inst_addr = NextHead(inst_addr)
return invoke_num return invoke_num
def calIns(bl, inst):
calls = {}
calls.update(inst)
start = bl[0]
end = bl[1]
invoke_num = 0
inst_addr = start
while inst_addr < end:
opcode = GetMnem(inst_addr)
if opcode in calls:
invoke_num += 1
inst_addr = NextHead(inst_addr)
return invoke_num
def calSconstants(bl): def calSconstants(bl):
calls = {}
start = bl[0] start = bl[0]
end = bl[1] end = bl[1]
invoke_num = 0 invoke_num = 0

View File

@ -0,0 +1,66 @@
# coding=utf-8
import os
import subprocess
import multiprocessing
from tqdm import tqdm
import time
# Per-PE-file processing timeout, in seconds.
TIMEOUT = 60


def call_preprocess(cmd_line):
    """Run one IDA preprocessing command line in a shell and block until it exits."""
    subprocess.call(cmd_line, shell=True)
def batch_mode():
    """Drive IDA over every PE sample in each workflow directory.

    For each sample, spawn ``idaq64`` (via ``call_preprocess``) in a child
    process, kill every IDA instance if the sample takes longer than
    ``TIMEOUT`` seconds, and persist the index of the last finished sample
    to a per-workflow log so an interrupted run can resume.

    Side effects: creates/updates the two log files and launches/kills
    external ``idaq64.exe`` processes (Windows-specific ``taskkill``).
    """
    for workflow in range(0, 1):
        pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
        # for test
        # pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_test'
        log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_log{}.log'.format(workflow)
        process_log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_process_log{}.log'.format(workflow)
        with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
            # BUGFIX: in 'a+' mode the initial stream position is
            # platform-dependent (often at end-of-file), so the resume
            # index could always read back as ''. Rewind before reading.
            log.seek(0)
            logged = log.readline()
            if logged == '':
                log_index = 0
            else:
                log_index = int(logged)
            # pe = "VirusShare_bc161e5e792028e8137aa070fda53f82"
            for index, pe in enumerate(tqdm(sorted(os.listdir(pe_dir)))):
                if index < log_index:
                    continue  # already processed in a previous run
                cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format(workflow, os.path.join(pe_dir, pe))
                p = multiprocessing.Process(target=call_preprocess, args=[cmd_line])
                p.start()
                flag_kill = True
                start = time.time()
                # Poll once a second until the child exits or TIMEOUT elapses.
                while time.time() - start <= TIMEOUT:
                    if not p.is_alive():
                        flag_kill = False
                        break
                    time.sleep(1)
                if flag_kill:
                    # IDA is stuck on this sample: kill all idaq64 instances
                    # and record the sample name for later inspection.
                    subprocess.call('taskkill /im idaq64.exe /f')
                    process_log.write("index {}, {} in workflow {} stuck, process terminated.\n".format(index, pe, workflow))
                else:
                    # Normal completion: overwrite the resume log with the
                    # finished index so a restart continues after this sample.
                    log.truncate(0)
                    log.seek(0)
                    log.write(str(index))
                    log.flush()
                    process_log.write("index {}, {} process done.\n".format(index, pe))
# NOTE: this script must live in the IDA root directory and be launched from a
# cmd shell; otherwise IDA's bundled Python libraries cannot be resolved.
if __name__ == '__main__':
    batch_mode()

View File

@ -1,56 +1,47 @@
# -*- coding: UTF-8 -*- # -*- coding: UTF-8 -*-
import sys import pickle
from func import * from func import *
from raw_graphs import * from raw_graphs import *
from idc import * from idc import *
import idautils
import os import os
import argparse import sys
import raw_graphs
def print_obj(obj):
"打印对象的所有属性"
print(obj.__dict__)
def parse_command(): def preprocess():
parser = argparse.ArgumentParser(description='Process some integers.') # E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
parser.add_argument("--path", type=str, help="The directory where to store the generated .ida file") # print str(sys.argv) #['raw-feature-extractor/preprocessing_ida.py']
args = parser.parse_args() # print str(idc.ARGV) #['raw-feature-extractor/preprocessing_ida.py', '--path', 'C:\\Program1\\pycharmproject\\Genius3\\acfgs']
return args # print idc.ARGV[2]
# print type(idc.ARGV[2])
if __name__ == '__main__': binary_name = idc.GetInputFile()
#E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
#print str(sys.argv) #['raw-feature-extractor/preprocessing_ida.py']
#print str(idc.ARGV) #['raw-feature-extractor/preprocessing_ida.py', '--path', 'C:\\Program1\\pycharmproject\\Genius3\\acfgs']
#print idc.ARGV[2]
#print type(idc.ARGV[2])
# E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -A -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius4\acfgs" hpcenter workflow = idc.ARGV[1]
#测试生成原始特征的时间。 # workflow = 0
start_t = time.clock() cfg_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
gdl_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot\\{}.dot".format(workflow, binary_name)
args = parse_command()
#path = args.path
path = idc.ARGV[2]
analysis_flags = idc.GetShortPrm(idc.INF_START_AF) analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
analysis_flags &= ~idc.AF_IMMOFF analysis_flags &= ~idc.AF_IMMOFF
# turn off "automatically make offset" heuristic
idc.SetShortPrm(idc.INF_START_AF, analysis_flags) idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
idaapi.autoWait() idaapi.autoWait()
# 生成pe文件的cfg列表
cfgs = get_func_cfgs_c(FirstSeg()) cfgs = get_func_cfgs_c(FirstSeg())
# 生成pe文件的fcg
# idc.GenCallGdl(gdl_path, 'Call Gdl', idc.CHART_GEN_GDL) 这个生成gdl文件网上几乎找不到gdl这个格式
idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT)
end_t = time.clock() full_path = os.path.join(cfg_path, binary_name + '.ida')
print (end_t - start_t) #1.5934438s hpcenter 83.4 KB #35.6745299s SCGDW698 5.5mb #14.1480888s 762kb SCMQTTIot 这个时间包括ida分析二进制文件的时间和脚本生成对应原始特征的时间 pickle.dump(cfgs, open(full_path, 'w'))
# 应该是随着函数和基本块的数量增加而线性增加的先不写了。可能ida分析二进制文件的占比比较高
binary_name = idc.GetInputFile() + '.ida' # 由于命令行模式也必须打开ida pro因此每次结束自动关闭ida
print path idc.Exit(0)
print binary_name
fullpath = os.path.join(path, binary_name)
pickle.dump(cfgs, open(fullpath,'w'))
#print binary_name
# 通用命令行格式 idaq64 -c -A -S"preprocessing_ida.py arg1 arg2" VirusShare_bca58b12923073
#加上这句脚本执行完就退出IDA # 此处使用 idaq64 -c -A -S"preprocessing_ida.py workflow" -oF:\iout pe_path完整命令行如下
#idc.Exit(0) # F:\kkk\IDA_6.6\idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oF:\iout D:\hkn\infected\datasets\virusshare_infected0\VirusShare_bc161e5e792028e8137aa070fda53f82
if __name__ == '__main__':
preprocess()

View File

@ -2,24 +2,26 @@
import itertools import itertools
import sys import sys
sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/') # sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
sys.path.insert(1, 'C:/Python27/Lib/site-packages') # sys.path.insert(1, 'C:/Python27/Lib/site-packages')
import networkx as nx import networkx as nx
#import numpy as np import numpy as np
from subprocess import Popen, PIPE from subprocess import Popen, PIPE
import pdb import pdb
import os import os
import re,mmap import re
#from graph_edit_new import * import mmap
# from graph_edit_new import *
class raw_graph: class raw_graph:
def __init__(self, funcname, g, func_f): def __init__(self, funcname, g, func_f, bb_f):
#print "create" #print "create"
self.funcname = funcname self.funcname = funcname
self.old_g = g[0] self.old_g = g[0]
self.g = nx.DiGraph() self.g = nx.DiGraph()
self.entry = g[1] self.entry = g[1]
self.bb_features = bb_f # len=bb数量,每个元素都是一个11维向量
self.fun_features = func_f self.fun_features = func_f
self.attributing() self.attributing()
@ -54,6 +56,9 @@ class raw_graph:
offsprings[suc] = 1 offsprings[suc] = 1
self.getOffsprings(g, suc, offsprings) self.getOffsprings(g, suc, offsprings)
# 提取acfg的属性特征
# 调用/传输/算术/逻辑/比较/移动/终止
# 数据声明/总指令数/字符串或整数常量/后代的数量
def retrieveVec(self, id_, g): def retrieveVec(self, id_, g):
feature_vec = [] feature_vec = []
#numC0 #numC0
@ -96,7 +101,7 @@ class raw_graph:
def genMotifs(self, n): def genMotifs(self, n):
motifs = {} motifs = {}
subgs = enumerating(n) subgs = self.enumerating(n)
for subg in subgs: for subg in subgs:
if len(motifs) == 0: if len(motifs) == 0:
motifs[subg] = [subg] motifs[subg] = [subg]
@ -182,7 +187,7 @@ class raw_graph:
tg.updateG(fang, indexes, self.g) tg.updateG(fang, indexes, self.g)
return tg return tg
pdb.set_trace() pdb.set_trace()
print "there is g which is none" print("there is g which is none")
def createG(self, binary_str, n): def createG(self, binary_str, n):
g = nx.DiGraph() g = nx.DiGraph()

View File

@ -1,70 +1,71 @@
# -*- coding: UTF-8 -*- # -*- coding: UTF-8 -*-
import sys import sys
import sys
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
sys.path.insert(1, 'C:/Python27/Lib/site-packages')
import networkx as nx import networkx as nx
import pickle
# sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
# sys.path.insert(1, 'C:/Python27/Lib/site-packages')
def print_obj(obj): def print_obj(obj):
"打印对象的所有属性" # "打印对象的所有属性"
print(obj.__dict__) print(obj.__dict__)
import pickle
#sub_10F20 308 反编译代码有字符串,但是这个特征提取里没有字符串 constant可能是间接引用的不识别。看了下所有函数的特征几乎都没有字符串常量可能都是写在别的地方然后引用的。 # sub_10F20 308 反编译代码有字符串,但是这个特征提取里没有字符串 constant可能是间接引用的不识别。看了下所有函数的特征几乎都没有字符串常量可能都是写在别的地方然后引用的。
#sub_166C4 393 # sub_166C4 393
if __name__ == '__main__': if __name__ == '__main__':
testpath = "D:\\hkn\\infected\\datasets\\virusshare_infected0_cfg\\VirusShare_cd53c6637ca75ac5fc1cbe6d2ced41a1.ida"
testpath = "C:\Program1\pycharmproject\Genius3/acfgs/hpcenter.ida"
fr = open(testpath, 'r') fr = open(testpath, 'r')
data1 = pickle.load(fr) #一个二进制文件的acfgs data = pickle.load(fr) #一个二进制文件的acfgs
#print(type(data1)) fr.close()
#print_obj(data1)
#print data1.raw_graph_list[393]
#print_obj(data1.raw_graph_list[393])
#nx.draw(data1.raw_graph_list[393].g,with_labels=True)
#plt.show()
print "一个二进制文件的所有函数的原始特征list。" # print(type(data1))
print_obj(data1) #acfg list # print_obj(data1)
print "\n" # print data1.raw_graph_list[393]
# print_obj(data1.raw_graph_list[393])
# nx.draw(data1.raw_graph_list[393].g,with_labels=True)
# plt.show()
print "一个函数的原始特征由old_gdiscovRe方法的ACFGgGenius方法的ACFGfun_feature表示函数级别的特征的向量三部分构成" print("一个二进制文件的所有函数的原始特征list。")
print_obj(data1.raw_graph_list[393]) #一个函数的acfg print_obj(data) # acfg list
print "\n" print("\n")
feature=data1.raw_graph_list[393].fun_features
print "函数级别特征: # 1 function calls # 2 logic instructions # 3 TransferIns # 4 LocalVariables # 5 BB basicblocks# 6 Edges # 7 IncommingCalls# 8 Intrs# 9 between # 10 strings # 11 consts" print("一个函数的原始特征由old_gdiscovRe方法的ACFGgGenius方法的ACFGfun_feature表示函数级别的特征的向量三部分构成")
print feature print_obj(data.raw_graph_list[0]) # 一个函数的acfg
print "\n" print("其中fun_features = 函数级别特征: # 1 function calls # 2 logic instructions # 3 TransferIns # 4 LocalVariables # 5 BB basicblocks# 6 Edges # 7 IncommingCalls# 8 Intrs# 9 between # 10 strings # 11 consts")
# feature = data.raw_graph_list[0].fun_features
print("old_g:{}".format(data.raw_graph_list[0].old_g))
print("g:{}".format(data.raw_graph_list[0].g))
# G=data1.raw_graph_list[393].old_g # G = data1.raw_graph_list[393].old_g
# print G.node[0] # G.node[i]是dict # print G.node[0] # G.node[i]是dict
# for key, value in G.node[0].items(): # for key, value in G.node[0].items():
# print('{key}:{value}'.format(key=key, value=value)) # print('{key}:{value}'.format(key=key, value=value))
# 一个基本块的特征 #1'consts' 数字常量 #2'strings'字符串常量 #3'offs' offspring 字节点数量? #4'numAs' 算数指令如INC #5'numCalls' 调用指令 #6'numIns' 指令数量 #7'numLIs' LogicInstructions 如AND #8'numTIs' 转移指令数量 # 基本块的特征 #1'consts' 数字常量 #2'strings'字符串常量 #3'offs' offspring 字节点数量? #4'numAs' 算数指令如INC #5'numCalls' 调用指令 #6'numIns' 指令数量 #7'numLIs' LogicInstructions 如AND #8'numTIs' 转移指令数量
G=data1.raw_graph_list[393].g G = data.raw_graph_list[0].g
print "# 一个基本块的特征 #1'consts' 数字常量 #2'strings'字符串常量 #3'offs' offspring 字节点数量? #4'numAs' 算数指令如INC #5'numCalls' 调用指令 #6'numIns' 指令数量 #7'numLIs' LogicInstructions 如AND #8'numTIs' 转移指令数量" print("# 基本块的特征 #1'consts' 数字常量 #2'strings'字符串常量 #3'offs' offspring 后代数量 #4'numAs' 算数指令如INC #5'numCalls' 调用指令 #6'numIns' 指令数量 #7'numLIs' LogicInstructions 逻辑如AND #8'numTIs' 转移指令数量")
print G.node[0] # print(G.node[0])
print "\n" # print("\n")
# for key, value in G.node[0].items(): # 函数内所有基本快的特征
# print('{key}:{value}'.format(key=key, value=value)) for key, value in G.node.items():
print('{}:{}'.format(key, value))
#oldg就是读取IDA的CFG所以数量、方向等都一样g根据old_g生成也一样 #oldg就是读取IDA的CFG所以数量、方向等都一样g根据old_g生成也一样
#old g #old g
G = data1.raw_graph_list[393].old_g G = data.raw_graph_list[0].old_g
nx.draw(G,with_labels=True) nx.draw(G, with_labels=True)
#plt.title('old_g') #plt.title('old_g')
plt.show() plt.show()
# g # g
G = data1.raw_graph_list[393].g G = data.raw_graph_list[0].g
nx.draw(G,with_labels=True) nx.draw(G, with_labels=True)
#plt.title('Genius_g') #plt.title('Genius_g')
plt.show() plt.show()

View File

@ -1,8 +1,70 @@
import re
import os
import subprocess
import time
def func():
    """Scratch helper: collect the node-id pairs of every edge in one .dot file.

    Scans a hard-coded Graphviz file and prints a list of [src, dst] id lists,
    one per line containing '->'.
    """
    path = "D:\\hkn\\infected\\datasets\\virusshare_infected0_dot\\VirusShare_ccbfc20470b099a188bda55aa8421427.dot"
    result = []
    with open(path, 'r') as f:
        for line in f:
            if '->' in line:
                # edge lines look like "a -> b"; grab all integer tokens
                result.append(re.findall(r'\b\d+\b', line))
    print(result)
def func1():
    """Scratch helper: print each file name in the dot directory minus its
    last four characters (the '.dot' suffix)."""
    dot_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected0_dot"
    for filename in os.listdir(dot_dir):
        print(filename[:-4])
def gen_dir():
    """One-off dataset directory maintenance.

    Currently removes the per-workflow ``*_iout`` directories; the commented
    lines are earlier create/remove variants kept for reuse.
    NOTE(review): os.rmdir raises if a directory is missing or non-empty —
    presumably these dirs exist and are empty when this is run; verify first.
    """
    parent_dir = "D:\\hkn\\infected\\datasets"
    for workflow in range(0, 35):
        # infected = "virusshare_infected{}".format(workflow)
        # cfg = "virusshare_infected{}_cfg".format(workflow)
        # dot = "virusshare_infected{}_dot".format(workflow)
        # jsonl = "virusshare_infected{}_json".format(workflow)
        iout = "virusshare_infected{}_iout".format(workflow)
        # os.mkdir(os.path.join(parent_dir, infected))
        # os.mkdir(os.path.join(parent_dir, cfg))
        # os.mkdir(os.path.join(parent_dir, dot))
        # os.mkdir(os.path.join(parent_dir, jsonl))
        os.rmdir(os.path.join(parent_dir, iout))
        # os.rmdir(os.path.join(parent_dir, ida))
def change_max_item_lines(cfg_path="F:\\kkk\\IDA_6.6\\cfg\\ida.cfg"):
    """Raise IDA's MAX_ITEM_LINES setting from 5000 to 50000 in ida.cfg.

    Reads the config file as bytes, replaces the setting line, and writes the
    file back in place. If the marker is absent the file is rewritten
    unchanged.

    :param cfg_path: path to IDA's ida.cfg (defaults to the original
        hard-coded install location, so existing no-arg calls still work).
    """
    # 'with' guarantees the handles are closed even on error (the original
    # left the write handle unclosed if f.write raised).
    with open(cfg_path, 'rb') as f:
        content = f.read()
    patched = content.replace(b'MAX_ITEM_LINES = 5000', b'MAX_ITEM_LINES = 50000')
    with open(cfg_path, 'wb') as f:
        f.write(patched)
def clock(p=None):
    """Watchdog: wait up to TIMEOUT seconds for process *p* to finish.

    If *p* is still alive after the timeout, force-kill every running
    idaq64.exe via Windows ``taskkill``.

    :param p: a ``multiprocessing.Process``-like object exposing
        ``is_alive()``. BUGFIX: the original read an undefined global ``p``
        and raised NameError when called; it is now a parameter. ``None``
        is treated as "nothing to wait for" and returns immediately.
    :returns: None.
    """
    TIMEOUT = 10
    start = time.time()
    flag_kill = True
    while time.time() - start <= TIMEOUT:
        if p is None or not p.is_alive():
            flag_kill = False
            break
        time.sleep(1)  # avoid hogging the CPU while polling
    if flag_kill:
        subprocess.call('taskkill /im idaq64.exe /f')
if __name__ == '__main__':
    # Scratch entry point: most lines are one-off utilities toggled by
    # commenting. Currently it kills any running IDA instances and then
    # inspects a previously pickled ACFG file.
    # gen_dir()
    # change_max_item_lines()
    subprocess.call('taskkill /im idaq64.exe /f')

    import pickle
    testpath = "C:\Program1\pycharmproject\Genius3/acfgs/hpcenter.ida"
    fr = open(testpath, 'r')
    data1 = pickle.load(fr)
    print(type(data1))
    # # print_obj(data1)
    # print cfgs.raw_graph_list[0]

View File

@ -1,286 +0,0 @@
import copy
import networkx as nx
from idautils import *
from idaapi import *
from idc import *
import copy
import networkx as nx
from idautils import *
from idaapi import *
from idc import *
from graph_analysis_ida import *
def getCfg(func, externs_eas, ea_externs):
    """Build an attributed CFG (networkx DiGraph) for one IDA function.

    Nodes are small integer ids; each node's 'label' attribute holds the
    basic-block (start_ea, end_ea) pair from obtain_block_sequence(). Edges
    are derived from IDA code references into each block's start. After
    construction the per-block features are attached via attributingRe().

    :param func: IDA func_t object.
    :param externs_eas, ea_externs: extern tables forwarded to attributingRe.
    :returns: (cfg, 0) — the second element is a placeholder (the
        merged/filtered "old cfg" path below is commented out).
    """
    func_start = func.startEA
    func_end = func.endEA
    cfg = nx.DiGraph()
    control_blocks, main_blocks = obtain_block_sequence(func)
    i = 0
    visited = {}
    start_node = None
    for bl in control_blocks:
        start = control_blocks[bl][0]
        end = control_blocks[bl][1]
        src_node = (start, end)
        # Allocate a stable integer id per distinct (start, end) block.
        if src_node not in visited:
            src_id = len(cfg)
            visited[src_node] = src_id
            cfg.add_node(src_id)
            cfg.node[src_id]['label'] = src_node
        else:
            src_id = visited[src_node]
        #if end in seq_blocks and GetMnem(PrevHead(end)) != 'jmp':
        if start == func_start:
            cfg.node[src_id]['c'] = "start"
            start_node = src_node
        if end == func_end:
            cfg.node[src_id]['c'] = "end"
        #print control_ea, 1
        # Ordinary (non-flow) code references into this block's start.
        refs = CodeRefsTo(start, 0)
        for ref in refs:
            if ref in control_blocks:
                dst_node = control_blocks[ref]
                if dst_node not in visited:
                    visited[dst_node] = len(cfg)
                dst_id = visited[dst_node]
                cfg.add_edge(dst_id, src_id)
                cfg.node[dst_id]['label'] = dst_node
        #print control_ea, 1
        # Same again including normal-flow references.
        refs = CodeRefsTo(start, 1)
        for ref in refs:
            if ref in control_blocks:
                dst_node = control_blocks[ref]
                if dst_node not in visited:
                    visited[dst_node] = len(cfg)
                dst_id = visited[dst_node]
                cfg.add_edge(dst_id, src_id)
                cfg.node[dst_id]['label'] = dst_node
    #print "attributing"
    attributingRe(cfg, externs_eas, ea_externs)
    # removing deadnodes
    #old_cfg = copy.deepcopy(cfg)
    #transform(cfg)
    return cfg, 0
def transform(cfg):
    """Simplify *cfg* in place: merge single-successor/single-predecessor
    chains, then drop epilogue-only blocks (see merging/filtering)."""
    merging(cfg)
    filtering(cfg)
def merging(cfg):
    """Collapse linear chains: if a block has exactly one successor and that
    successor has exactly one predecessor, merge the pair via domerge().

    Mutates *cfg* in place.
    """
    bb_ids = cfg.nodes()
    for bb_id in bb_ids:
        try:
            bb = cfg.node[bb_id]['label']
            bb_start = bb[0]
            bb_end = bb[1]
            succs = cfg.successors(bb_id)
            #preds = cfg.predecessors(bb_id)
            if len(succs) == 1:
                preds = cfg.predecessors(succs[0])
                if len(preds) == 1:
                    domerge(cfg, bb_id, succs[0])
        except:
            # NOTE(review): bare except presumably skips nodes already removed
            # by an earlier merge in this loop — but it also hides real errors.
            pass
def domerge(cfg, bb_id, suc_node):
    """Fold *suc_node* into *bb_id*: re-point each of suc_node's outgoing
    edges to originate from bb_id, then delete suc_node from the graph."""
    for succ in cfg.successors(suc_node):
        cfg.add_edge(bb_id, succ)
    cfg.remove_node(suc_node)
def filtering(cfg):
    """Remove every block whose instruction sequence matches a removable
    epilogue/return pattern (see remove()/matchseq()). Mutates *cfg*.
    """
    rm_sets = []
    for bb_id in cfg:
        bb = cfg.node[bb_id]['label']
        bb_start = bb[0]
        bb_end = bb[1]
        # NOTE(review): local name 're' shadows the 're' module here.
        re = remove(bb_start, bb_end)
        print bb_id, re, bb_start, bb_end
        if re:
            print re, bb_id
            rm_sets.append(bb_id)
    print rm_sets
    # Delete after the scan so iteration over cfg is not disturbed.
    for bb_id in rm_sets:
        cfg.remove_node(bb_id)
def remove(bb_start, bb_end):
    """Return True when the block [bb_start, bb_end] consists only of a
    recognizable epilogue / return-value instruction pattern."""
    return matchseq(getSequences(bb_start, bb_end))
def matchseq(seqs):
    """Return True if *seqs* (a block's opcode sequence) matches a known
    epilogue or return-value pattern.

    Two kinds of match, preserved exactly from the original logic:
      * subset match — every element of *seqs* belongs to a full-epilogue
        pattern (MIPS ``lw/jr/addiu`` or x86 ``add/pop/retn``); note this
        means an empty *seqs* returns True;
      * exact match — *seqs* as a set equals a branch+return-value or
        bare return-value pattern.

    Elements may be opcode strings or (opcode, operand) tuples.

    Improvement over the original: set(seqs) was rebuilt for every
    comparison (up to six times); it is now computed once, and the patterns
    use set literals / subset syntax.
    """
    seq_set = set(seqs)
    mips_epilogue = {'lw', 'jr', 'addiu'}
    x86_epilogue = {'add', 'pop', 'retn'}
    b_mips = {'b', ('move', '$v0')}
    # NOTE(review): 'b' is a MIPS mnemonic; an x86 unconditional jump would
    # be 'jmp'. This looks copied from b_mips — confirm before relying on it.
    b_x86 = {'b', ('mov', '$eax')}
    re_mips = {('move', '$v0')}
    re_x86 = {('mov', '$eax')}
    if seq_set <= mips_epilogue or seq_set <= x86_epilogue:
        return True
    return seq_set in (b_mips, b_x86, re_mips, re_x86)
def attributingRe(cfg, externs_eas, ea_externs):
    """Attach per-basic-block features to every node of *cfg* in place.

    Each node's 'label' is its (start_ea, end_ea) pair; the cal*/get*
    helpers (from graph_analysis_ida) walk that range with the IDA API.
    Attributes written: numIns, numCalls, numLIs, numAs, numNc (count of
    strings + numeric consts), consts, strings, externs, numTIs.
    """
    for node_id in cfg:
        bl = cfg.node[node_id]['label']
        numIns = calInsts(bl)
        cfg.node[node_id]['numIns'] = numIns
        numCalls = calCalls(bl)
        cfg.node[node_id]['numCalls'] = numCalls
        numLIs = calLogicInstructions(bl)
        cfg.node[node_id]['numLIs'] = numLIs
        numAs = calArithmeticIns(bl)
        cfg.node[node_id]['numAs'] = numAs
        strings, consts = getBBconsts(bl)
        cfg.node[node_id]['numNc'] = len(strings) + len(consts)
        cfg.node[node_id]['consts'] = consts
        cfg.node[node_id]['strings'] = strings
        externs = retrieveExterns(bl, ea_externs)
        cfg.node[node_id]['externs'] = externs
        numTIs = calTransferIns(bl)
        cfg.node[node_id]['numTIs'] = numTIs
def attributing(cfg):
    """Alternative attribution pass using graph_analysis: offspring counts,
    per-node statement counts and raw byte values, then dominator and loop
    checks. Mutates *cfg* in place; nodes here are (start, end) pairs.
    """
    ga = graph_analysis()
    ga.gwithoffspring(cfg)
    print "finishing offspring"
    for node in cfg:
        stmt_num = getStmtNum(node)
        binary_value = getBinaryValue(node)
        cfg.node[node]['stmt_num'] = stmt_num
        cfg.node[node]['binary_value'] = binary_value
    ga.domChecking(cfg)
    print "finishing domChecking"
    ga.loopChecking(cfg)
    print "finishing loopChecking"
def getStmtNum(node):
    """Count the instructions in a basic block.

    :param node: (start_ea, end_ea) pair; end is exclusive.
    :returns: number of instruction heads between start and end, stepping
        with IDA's NextHead.
    """
    start = node[0]
    end = node[1]
    stmt_num = 0
    inst_addr = start
    while inst_addr < end:
        inst_addr = NextHead(inst_addr)
        stmt_num += 1
    return stmt_num
def getBinaryValue(node):
    """Pack the raw bytes of the block's first instruction into one integer.

    Reads each original byte of the first instruction (start up to
    NextHead(start)) with GetOriginalByte, OR-ing it in and shifting left
    8 bits per byte, then ORs in the final byte without a shift.
    NOTE(review): the loop runs over (inst_len - 1) offsets and the last
    byte is handled separately — verify the intended byte order/off-by-one
    against callers before reuse. Debug prints are left in.
    """
    start = node[0]
    inst_addr = NextHead(start)
    value = 0
    addr = 0
    for x in xrange((inst_addr - start)-1):
        addr = start + x
        y = GetOriginalByte(addr)
        print value, addr, y
        value = value | y
        value = value << 8
    print value
    addr = inst_addr - 1
    y = GetOriginalByte(addr)
    print value, addr, y
    value = value | y
    print node
    print bin(value)
    return value
def cfg_construct(func):
    """Build a CFG whose nodes are (start_ea, end_ea) block pairs.

    Edges come from (a) fall-through to the next sequential block when the
    block does not end in 'jmp', and (b) explicit code references out of the
    block's last instruction.

    :param func: IDA func_t object.
    :returns: (cfg, start_node).
    NOTE(review): start_node is only assigned when some block starts at
    func.startEA — otherwise the final return raises NameError; presumably
    IDA always yields such a block. Confirm before reuse.
    """
    func_start = func.startEA
    func_end = func.endEA
    cfg = nx.DiGraph()
    seq_blocks, main_blocks = obtain_block_sequence(func)
    i = 0
    visited = {}
    for bl in seq_blocks:
        start = seq_blocks[bl][0]
        end = seq_blocks[bl][1]
        src_node = (start, end)
        # Fall-through edge: the block ending here flows into the block
        # starting at 'end' unless it ends with an unconditional jump.
        if end in seq_blocks and GetMnem(PrevHead(end)) != 'jmp':
            next_start = seq_blocks[end][0]
            next_end = seq_blocks[end][1]
            next_node = (next_start, next_end)
            cfg.add_edge(src_node, next_node)
        if start == func_start:
            cfg.add_node(src_node, c='start')
            start_node = src_node
        if end == func_end:
            cfg.add_node(src_node, c='end')
        # Branch edges out of the block's last instruction.
        refs = CodeRefsFrom(PrevHead(end), 0)
        for ref in refs:
            #print ref
            if ref in seq_blocks:
                dst_node = (seq_blocks[ref][0], seq_blocks[ref][1])
                cfg.add_edge(src_node, dst_node)
    return cfg, start_node
def obtain_allpaths(cfg, node, path, allpaths):
    """Depth-first enumeration of all simple paths from *node* to any node
    tagged 'c' == 'end', appending each completed path to *allpaths*.

    *path* is the (mutated) prefix walked so far; each recursive branch
    continues on its own copy so siblings do not interfere.
    """
    path.append(node)
    if cfg.node[node].get('c') == 'end':
        allpaths.append(path)
        return
    for nxt in cfg.successors(node):
        if nxt in path:
            continue  # skip cycles: keep paths simple
        obtain_allpaths(cfg, nxt, copy.copy(path), allpaths)
def obtain_block_sequence(func):
    """Index a function's basic blocks for CFG construction.

    :param func: IDA func_t object.
    :returns: (control_blocks, x) where control_blocks maps both the block's
        control-transfer address (checkCB) and its last-instruction address
        to the (start, end) pair, and x is the sorted list of block start
        addresses lying inside the function.
    """
    control_blocks = {}
    main_blocks = {}
    blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
    for bl in blocks:
        base = bl[0]
        end = PrevHead(bl[1])
        control_ea = checkCB(bl)
        control_blocks[control_ea] = bl
        control_blocks[end] = bl
        if func.startEA <= base <= func.endEA:
            main_blocks[base] = bl
    x = sorted(main_blocks)
    return control_blocks, x
def checkCB(bl):
    """Return the address of the first conditional-branch instruction in
    block *bl* = (start, end); falls back to the block's last instruction
    when no branch is found."""
    start = bl[0]
    end = bl[1]
    ea = start
    while ea < end:
        if checkCondition(ea):
            return ea
        ea = NextHead(ea)
    return PrevHead(end)
def checkCondition(ea):
    """True if the instruction at *ea* is a known branch mnemonic.

    Only the MIPS and x86 tables are merged into the lookup dict.
    NOTE(review): arm_branch is defined but never added to conds, so ARM
    branches are not detected — looks unintentional; confirm before fixing.
    """
    mips_branch = {"beqz":1, "beq":1, "bne":1, "bgez":1, "b":1, "bnez":1, "bgtz":1, "bltz":1, "blez":1, "bgt":1, "bge":1, "blt":1, "ble":1, "bgtu":1, "bgeu":1, "bltu":1, "bleu":1}
    x86_branch = {"jz":1, "jnb":1, "jne":1, "je":1, "jg":1, "jle":1, "jl":1, "jge":1, "ja":1, "jae":1, "jb":1, "jbe":1, "jo":1, "jno":1, "js":1, "jns":1}
    arm_branch = {"B":1, "BAL":1, "BNE":1, "BEQ":1, "BPL":1, "BMI":1, "BCC":1, "BLO":1, "BCS":1, "BHS":1, "BVC":1, "BVS":1, "BGT":1, "BGE":1, "BLT":1, "BLE":1, "BHI":1 ,"BLS":1 }
    conds = {}
    conds.update(mips_branch)
    conds.update(x86_branch)
    opcode = GetMnem(ea)
    if opcode in conds:
        return True
    return False

View File

@ -1,228 +0,0 @@
#
# Reference Lister
#
# List all functions and all references to them in the current section.
#
# Implemented with the idautils module
#
import networkx as nx
import cPickle as pickle
import pdb
from graph_analysis_ida import *
from graph_property import *
#import wingdbstub
#wingdbstub.Ensure()
def get_funcs(ea):
    """Map every function name in the segment containing *ea* to the list
    of its basic-block (start_ea, end_ea) pairs (via IDA FlowChart)."""
    funcs = {}
    # Get current ea
    # Loop from start to end in the current segment
    for funcea in Functions(SegStart(ea)):
        funcname = GetFunctionName(funcea)
        func = get_func(funcea)
        blocks = FlowChart(func)
        funcs[funcname] = []
        for bl in blocks:
            start = bl.startEA
            end = bl.endEA
            funcs[funcname].append((start, end))
    return funcs
def get_funcs_for_discoverRe(ea):
    """Map each function name in *ea*'s segment to its discovRe feature
    vector.

    NOTE(review): get_discoverRe_feature is defined with two parameters
    (func, icfg) but is called here with only func — this call would raise
    TypeError; presumably dead code from an older signature. Verify.
    """
    features = {}
    for funcea in Functions(SegStart(ea)):
        funcname = GetFunctionName(funcea)
        print funcname
        func = get_func(funcea)
        feature = get_discoverRe_feature(func)
        features[funcname] = feature
    return features
def get_discoverRe_feature(func, icfg):
    """Compute the 11-element discovRe function-level feature vector.

    Order: [function calls, logic instrs, transfer instrs, local variables,
    basic blocks, CFG edges, incoming calls, total instrs, betweenness,
    string consts, numeric consts].

    :param func: IDA func_t object.
    :param icfg: the function's networkx CFG (used for edge count and
        betweenness via retrieveGP).
    """
    # start/end are computed but unused below; kept as-is.
    start = func.startEA
    end = func.endEA
    features = []
    FunctionCalls = getFuncCalls(func)
    #1
    features.append(FunctionCalls)
    LogicInstr = getLogicInsts(func)
    #2
    features.append(LogicInstr)
    Transfer = getTransferInsts(func)
    #3
    features.append(Transfer)
    Locals = getLocalVariables(func)
    #4
    features.append(Locals)
    BB = getBasicBlocks(func)
    #5
    features.append(BB)
    Edges = len(icfg.edges())
    #6
    features.append(Edges)
    Incoming = getIncommingCalls(func)
    #7
    features.append(Incoming)
    #8
    Instrs = getIntrs(func)
    features.append(Instrs)
    between = retrieveGP(icfg)
    #9
    features.append(between)
    strings, consts = getfunc_consts(func)
    features.append(strings)
    features.append(consts)
    return features
def get_func_names(ea):
    """Map each function name in *ea*'s segment to its entry address."""
    funcs = {}
    for funcea in Functions(SegStart(ea)):
        funcname = GetFunctionName(funcea)
        funcs[funcname] = funcea
    return funcs
def get_func_bases(ea):
    """Inverse of get_func_names: map each entry address to its name."""
    funcs = {}
    for funcea in Functions(SegStart(ea)):
        funcname = GetFunctionName(funcea)
        funcs[funcea] = funcname
    return funcs
def get_func_range(ea):
    """Map each function name in *ea*'s segment to its (startEA, endEA)."""
    funcs = {}
    for funcea in Functions(SegStart(ea)):
        funcname = GetFunctionName(funcea)
        func = get_func(funcea)
        funcs[funcname] = (func.startEA, func.endEA)
    return funcs
def get_func_sequences(ea):
    """Map each function name to the flat list of its opcode mnemonics,
    walking every basic block with GetMnem/NextHead."""
    funcs_bodylist = {}
    funcs = get_funcs(ea)
    for funcname in funcs:
        if funcname not in funcs_bodylist:
            funcs_bodylist[funcname] = []
        for start, end in funcs[funcname]:
            inst_addr = start
            while inst_addr <= end:
                opcode = GetMnem(inst_addr)
                funcs_bodylist[funcname].append(opcode)
                inst_addr = NextHead(inst_addr)
    return funcs_bodylist
def get_func_cfgs(ea):
    """Build a CFG for every function inside the 'LOAD' section.

    :returns: dict mapping function name -> cfg.cfg_construct() result.
    """
    func_cfglist = {}
    i = 0
    start, end = get_section('LOAD')
    #print start, end
    for funcea in Functions(SegStart(ea)):
        if start <= funcea <= end:
            funcname = GetFunctionName(funcea)
            func = get_func(funcea)
            print i
            i += 1
            try:
                icfg = cfg.cfg_construct(func)
                func_cfglist[funcname] = icfg
            except:
                # Best-effort: functions whose CFG construction fails are
                # silently skipped (bare except also hides real errors).
                pass
    return func_cfglist
def get_section(t):
    """Return the (start, end) address range of the segment named *t*."""
    base = SegByName(t)
    start = SegByBase(base)
    end = SegEnd(start)
    return start, end
def get_func_cfg_sequences(func_cfglist):
    """For every function's CFG, map each basic block (start, end) to its
    opcode sequence (see get_sequences)."""
    func_cfg_seqlist = {}
    for funcname in func_cfglist:
        func_cfg_seqlist[funcname] = {}
        # cfg_construct returns (graph, start_node); [0] is the graph,
        # whose nodes are (start, end) pairs.
        cfg = func_cfglist[funcname][0]
        for start, end in cfg:
            codesq = get_sequences(start, end)
            func_cfg_seqlist[funcname][(start,end)] = codesq
    return func_cfg_seqlist
def get_sequences(start, end):
    """Return the opcode mnemonics from *start* through *end* inclusive,
    stepping with IDA's NextHead."""
    seq = []
    inst_addr = start
    while inst_addr <= end:
        opcode = GetMnem(inst_addr)
        seq.append(opcode)
        inst_addr = NextHead(inst_addr)
    return seq
def get_stack_arg(func_addr):
    """Collect stack-frame member names for the function at *func_addr*,
    skipping the saved-registers/return-address pseudo members (names
    containing ' s' or ' r'). Returns [] when the function has no frame."""
    print func_addr
    args = []
    stack = GetFrame(func_addr)
    if not stack:
        return []
    firstM = GetFirstMember(stack)
    lastM = GetLastMember(stack)
    i = firstM
    while i <=lastM:
        mName = GetMemberName(stack,i)
        mSize = GetMemberSize(stack,i)
        # Advance by the member's size; fall back to 4 bytes when IDA
        # reports no size for this offset.
        if mSize:
            i = i + mSize
        else:
            i = i+4
        if mName not in args and mName and ' s' not in mName and ' r' not in mName:
            args.append(mName)
    return args
#pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))
def processDataSegs():
    """Scan DATA/BSS segments and cross-reference them with functions.

    :returns: (funcdata, datafunc) where funcdata maps function name -> list
        of data addresses it references, and datafunc maps data address ->
        list of function names referencing it.
    """
    funcdata = {}
    datafunc = {}
    for n in xrange(idaapi.get_segm_qty()):
        seg = idaapi.getnseg(n)
        ea = seg.startEA
        segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE)
        if segtype in [idc.SEG_DATA, idc.SEG_BSS]:
            start = idc.SegStart(ea)
            end = idc.SegEnd(ea)
            cur = start
            while cur <= end:
                refs = [v for v in DataRefsTo(cur)]
                for fea in refs:
                    name = GetFunctionName(fea)
                    # Skip references that do not originate inside a function.
                    if len(name)== 0:
                        continue
                    if name not in funcdata:
                        funcdata[name] = [cur]
                    else:
                        funcdata[name].append(cur)
                    if cur not in datafunc:
                        datafunc[cur] = [name]
                    else:
                        datafunc[cur].append(name)
                cur = NextHead(cur)
    return funcdata, datafunc
def obtainDataRefs(callgraph):
datarefs = {}
funcdata, datafunc = processDataSegs()
for node in callgraph:
if node in funcdata:
datas = funcdata[node]
for dd in datas:
refs = datafunc[dd]
refs = list(set(refs))
if node in datarefs:
print refs
datarefs[node] += refs
datarefs[node] = list(set(datarefs[node]))
else:
datarefs[node] = refs
return datarefs

View File

@ -1,285 +0,0 @@
#
# Reference Lister
#
# List all functions and all references to them in the current section.
#
# Implemented with the idautils module
#
from idautils import *
from idaapi import *
from idc import *
import networkx as nx
import cfg_constructor as cfg
import cPickle as pickle
import pdb
from raw_graphs import *
#from discovRe_feature.discovRe import *
from discovRe import *
#import wingdbstub
#wingdbstub.Ensure()
def gt_funcNames(ea):
    """Return unified names of all functions in ea's segment, excluding
    PLT/extern stub entries."""
    funcs = []
    plt_func, plt_data = processpltSegs()
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        if funcname in plt_func:
            # import stub, not a real function body
            print funcname
            continue
        funcs.append(funcname)
    return funcs
def get_funcs(ea):
    """Map each non-PLT function in ea's segment to its basic-block
    (start, end) address ranges."""
    funcs = {}
    # Get current ea
    # Loop from start to end in the current segment
    plt_func, plt_data = processpltSegs()
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        if funcname in plt_func:
            # skip import stubs
            continue
        func = get_func(funcea)
        blocks = FlowChart(func)
        funcs[funcname] = []
        for bl in blocks:
            start = bl.startEA
            end = bl.endEA
            funcs[funcname].append((start, end))
    return funcs
# used for the callgraph generation.
def get_func_namesWithoutE(ea):
    """Map unified function name -> entry address for every function in
    ea's segment, excluding PLT/extern stubs.

    Used for callgraph generation. Fix: removed leftover debug prints
    (one fired on any name containing 'close', one on every stub).
    """
    funcs = {}
    plt_func, plt_data = processpltSegs()
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        # import stubs are not real function bodies
        if funcname in plt_func:
            continue
        funcs[funcname] = funcea
    return funcs
# used for the callgraph generation.
def get_func_names(ea):
    """Map unified function name -> entry address for every function in
    ea's segment (used for callgraph generation)."""
    name_to_ea = {}
    for funcea in Functions(SegStart(ea)):
        name_to_ea[get_unified_funcname(funcea)] = funcea
    return name_to_ea
def get_func_bases(ea):
    """Map entry address -> unified name for non-PLT functions."""
    ea_to_name = {}
    plt_func, _plt_data = processpltSegs()
    for funcea in Functions(SegStart(ea)):
        name = get_unified_funcname(funcea)
        if name not in plt_func:
            ea_to_name[funcea] = name
    return ea_to_name
def get_func_range(ea):
    """Map unified function name -> (startEA, endEA) for every function
    in ea's segment."""
    ranges = {}
    for funcea in Functions(SegStart(ea)):
        f = get_func(funcea)
        ranges[get_unified_funcname(funcea)] = (f.startEA, f.endEA)
    return ranges
def get_unified_funcname(ea):
    """Return the function name at *ea*, stripping one leading '.'
    (some toolchains prefix local symbols with a dot)."""
    funcname = GetFunctionName(ea)
    if funcname.startswith('.'):
        funcname = funcname[1:]
    return funcname
def get_func_sequences(ea):
    """Concatenate, per function, the mnemonic sequences of all its
    basic blocks (functions taken from get_funcs)."""
    funcs_bodylist = {}
    funcs = get_funcs(ea)
    for funcname in funcs:
        if funcname not in funcs_bodylist:
            funcs_bodylist[funcname] = []
        for start, end in funcs[funcname]:
            inst_addr = start
            while inst_addr <= end:
                opcode = GetMnem(inst_addr)
                funcs_bodylist[funcname].append(opcode)
                inst_addr = NextHead(inst_addr)
    return funcs_bodylist
def get_func_cfgs_c(ea):
    """Build an attributed raw_graph for every function of the loaded
    binary and collect them in a raw_graphs container."""
    binary_name = idc.GetInputFile()
    raw_cfgs = raw_graphs(binary_name)
    externs_eas, ea_externs = processpltSegs()
    i = 0  # progress counter
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        func = get_func(funcea)
        print i
        i += 1
        icfg = cfg.getCfg(func, externs_eas, ea_externs)
        func_f = get_discoverRe_feature(func, icfg[0])
        raw_g = raw_graph(funcname, icfg, func_f)
        raw_cfgs.append(raw_g)
    return raw_cfgs
def get_func_cfgs_ctest(ea):
    """Debug variant of get_func_cfgs_c: returns, per function, the
    (new, old) CFG pair produced by cfg.getCfg for comparison."""
    binary_name = idc.GetInputFile()
    raw_cfgs = raw_graphs(binary_name)  # NOTE(review): built but never used
    externs_eas, ea_externs = processpltSegs()
    i = 0
    diffs = {}
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        func = get_func(funcea)
        print i
        i += 1
        icfg, old_cfg = cfg.getCfg(func, externs_eas, ea_externs)
        diffs[funcname] = (icfg, old_cfg)
        #raw_g = raw_graph(funcname, icfg)
        #raw_cfgs.append(raw_g)
    return diffs
def get_func_cfgs(ea):
    """Build a CFG per function in ea's segment via cfg.getCfg,
    silently skipping functions the builder rejects."""
    func_cfglist = {}
    i = 0  # progress counter
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        func = get_func(funcea)
        print i
        i += 1
        try:
            icfg = cfg.getCfg(func)
            func_cfglist[funcname] = icfg
        except:
            # best-effort: ignore failures on odd functions
            pass
    return func_cfglist
def get_func_cfg_sequences(func_cfglist):
    """Per function, map each basic block (start, end) to its mnemonic
    sequence."""
    result = {}
    for funcname, entry in func_cfglist.items():
        graph = entry[0]
        result[funcname] = dict(
            ((s, e), get_sequences(s, e)) for s, e in graph)
    return result
def get_sequences(start, end):
    """Return the list of mnemonics for instruction heads in
    [start, end] inclusive."""
    ops = []
    cur = start
    while cur <= end:
        ops.append(GetMnem(cur))
        cur = NextHead(cur)
    return ops
def get_stack_arg(func_addr):
    """List stack-frame member names of the function at *func_addr*.

    Returns [] when IDA has no frame for the function.
    """
    print func_addr
    args = []
    stack = GetFrame(func_addr)
    if not stack:
        return []
    firstM = GetFirstMember(stack)
    lastM = GetLastMember(stack)
    i = firstM
    while i <=lastM:
        mName = GetMemberName(stack,i)
        mSize = GetMemberSize(stack,i)
        # advance by the member's size; unnamed gaps step 4 bytes
        if mSize:
            i = i + mSize
        else:
            i = i+4
        # NOTE(review): ' s'/' r' filters look aimed at IDA's " s"/" r"
        # frame markers -- confirm against the target frame layout
        if mName not in args and mName and ' s' not in mName and ' r' not in mName:
            args.append(mName)
    return args
#pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))
def processExternalSegs():
    """Map unified function name -> hex address string for every head in
    extern (SEG_XTRN) segments.

    Fix: removed the unused local ``datafunc`` the original built but
    never populated or returned.
    """
    funcdata = {}
    for n in xrange(idaapi.get_segm_qty()):
        seg = idaapi.getnseg(n)
        ea = seg.startEA
        segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE)
        if segtype in [idc.SEG_XTRN]:
            start = idc.SegStart(ea)
            end = idc.SegEnd(ea)
            cur = start
            while cur <= end:
                name = get_unified_funcname(cur)
                funcdata[name] = hex(cur)
                cur = NextHead(cur)
    return funcdata
def processpltSegs():
    """Collect stub entries from .plt/extern/.MIPS.stubs segments.

    Returns (funcdata, datafunc):
      funcdata: {unified name: hex address string}
      datafunc: {address: unified name}
    """
    funcdata = {}
    datafunc = {}
    for n in xrange(idaapi.get_segm_qty()):
        seg = idaapi.getnseg(n)
        ea = seg.startEA
        segname = SegName(ea)
        if segname in ['.plt', 'extern', '.MIPS.stubs']:
            start = seg.startEA
            end = seg.endEA
            cur = start
            while cur < end:
                name = get_unified_funcname(cur)
                funcdata[name] = hex(cur)
                datafunc[cur]= name
                cur = NextHead(cur)
    return funcdata, datafunc
def processDataSegs():
    """Scan DATA/BSS segments and cross-link data addresses with the
    functions that reference them.

    Returns (funcdata, datafunc):
      funcdata: {unified function name: [data addresses referenced]}
      datafunc: {data address: [function names referencing it]}
    """
    funcdata = {}
    datafunc = {}
    for n in xrange(idaapi.get_segm_qty()):
        seg = idaapi.getnseg(n)
        ea = seg.startEA
        segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE)
        if segtype in [idc.SEG_DATA, idc.SEG_BSS]:
            start = idc.SegStart(ea)
            end = idc.SegEnd(ea)
            cur = start
            while cur <= end:
                refs = [v for v in DataRefsTo(cur)]
                for fea in refs:
                    name = get_unified_funcname(fea)
                    # references from outside any function have no name
                    if len(name)== 0:
                        continue
                    if name not in funcdata:
                        funcdata[name] = [cur]
                    else:
                        funcdata[name].append(cur)
                    if cur not in datafunc:
                        datafunc[cur] = [name]
                    else:
                        datafunc[cur].append(name)
                cur = NextHead(cur)
    return funcdata, datafunc
def obtainDataRefs(callgraph):
datarefs = {}
funcdata, datafunc = processDataSegs()
for node in callgraph:
if node in funcdata:
datas = funcdata[node]
for dd in datas:
refs = datafunc[dd]
refs = list(set(refs))
if node in datarefs:
print refs
datarefs[node] += refs
datarefs[node] = list(set(datarefs[node]))
else:
datarefs[node] = refs
return datarefs

View File

@ -1,257 +0,0 @@
from idautils import *
from idaapi import *
from idc import *
def getfunc_consts(func):
    """Gather all string and numeric constants used by *func*,
    basic block by basic block."""
    strings = []
    consts = []
    for bl in [(v.startEA, v.endEA) for v in FlowChart(func)]:
        strs, nums = getBBconsts(bl)
        strings.extend(strs)
        consts.extend(nums)
    return strings, consts
def getConst(ea, offset):
    """Classify the operand at (ea, offset) as a string or a numeric
    constant.

    Returns (strings, consts); at most one of the lists is non-empty.
    """
    strings = []
    consts = []
    optype1 = GetOpType(ea, offset)
    if optype1 == idaapi.o_imm:
        imm_value = GetOperandValue(ea, offset)
        # small immediates are treated as plain numbers, never pointers
        if 0<= imm_value <= 10:
            consts.append(imm_value)
        else:
            if idaapi.isLoaded(imm_value) and idaapi.getseg(imm_value):
                str_value = GetString(imm_value)
                if str_value is None:
                    # NOTE(review): +0x40000 looks like a target-specific
                    # address remap offset -- confirm for this firmware
                    str_value = GetString(imm_value+0x40000)
                    if str_value is None:
                        consts.append(imm_value)
                    else:
                        # keep only printable-ish ASCII strings
                        re = all(40 <= ord(c) < 128 for c in str_value)
                        if re:
                            strings.append(str_value)
                        else:
                            consts.append(imm_value)
                else:
                    # keep only printable-ish ASCII strings
                    re = all(40 <= ord(c) < 128 for c in str_value)
                    if re:
                        strings.append(str_value)
                    else:
                        consts.append(imm_value)
            else:
                consts.append(imm_value)
    return strings, consts
def getBBconsts(bl):
    """Collect string/numeric constants from the basic block *bl*
    (a (start, end) pair), skipping call/jump instructions."""
    strings = []
    consts = []
    start = bl[0]
    end = bl[1]
    invoke_num = 0  # NOTE(review): unused leftover
    inst_addr = start
    while inst_addr < end:
        opcode = GetMnem(inst_addr)
        # call-like instructions carry addresses, not data constants
        if opcode in ['la','jalr','call', 'jal']:
            inst_addr = NextHead(inst_addr)
            continue
        strings_src, consts_src = getConst(inst_addr, 0)
        strings_dst, consts_dst = getConst(inst_addr, 1)
        strings += strings_src
        strings += strings_dst
        consts += consts_src
        consts += consts_dst
        try:
            # a third operand exists only on some architectures
            strings_dst, consts_dst = getConst(inst_addr, 2)
            consts += consts_dst
            strings += strings_dst
        except:
            pass
        inst_addr = NextHead(inst_addr)
    return strings, consts
def getFuncCalls(func):
    """Total number of call instructions in *func*."""
    return sum(calCalls((b.startEA, b.endEA)) for b in FlowChart(func))
def getLogicInsts(func):
    """Total number of logic instructions in *func*."""
    return sum(calLogicInstructions((b.startEA, b.endEA))
               for b in FlowChart(func))
def getTransferInsts(func):
    """Total number of control-transfer instructions in *func*."""
    return sum(calTransferIns((b.startEA, b.endEA))
               for b in FlowChart(func))
def getIntrs(func):
    """Total number of instructions in *func*."""
    return sum(calInsts((b.startEA, b.endEA)) for b in FlowChart(func))
def getLocalVariables(func):
    """Number of 'var_' stack variables in *func*."""
    return get_stackVariables(func.startEA)
def getBasicBlocks(func):
    """Number of basic blocks in *func*."""
    return sum(1 for _ in FlowChart(func))
def getIncommingCalls(func):
    """Number of code references into *func*'s entry point."""
    return len(list(CodeRefsTo(func.startEA, 0)))
def get_stackVariables(func_addr):
    """Count distinct 'var_' members in the function's stack frame.

    Returns 0 when IDA has no frame for the function.
    """
    #print func_addr
    args = []
    stack = GetFrame(func_addr)
    if not stack:
        return 0
    firstM = GetFirstMember(stack)
    lastM = GetLastMember(stack)
    i = firstM
    while i <=lastM:
        mName = GetMemberName(stack,i)
        mSize = GetMemberSize(stack,i)
        # advance by the member's size; unnamed gaps step 4 bytes
        if mSize:
            i = i + mSize
        else:
            i = i+4
        if mName not in args and mName and 'var_' in mName:
            args.append(mName)
    return len(args)
def calArithmeticIns(bl):
    """Count x86/MIPS arithmetic instructions in block (start, end)."""
    x86_AI = {'add':1, 'sub':1, 'div':1, 'imul':1, 'idiv':1, 'mul':1, 'shl':1, 'dec':1, 'inc':1}
    mips_AI = {'add':1, 'addu':1, 'addi':1, 'addiu':1, 'mult':1, 'multu':1, 'div':1, 'divu':1}
    arith = {}
    arith.update(x86_AI)
    arith.update(mips_AI)
    start, end = bl
    count = 0
    addr = start
    while addr < end:
        if GetMnem(addr) in arith:
            count += 1
        addr = NextHead(addr)
    return count
def calCalls(bl):
    """Count call instructions (x86 'call', MIPS 'jal'/'jalr') in the
    block (start, end)."""
    call_ops = {'call':1, 'jal':1, 'jalr':1}
    start, end = bl
    count = 0
    addr = start
    while addr < end:
        if GetMnem(addr) in call_ops:
            count += 1
        addr = NextHead(addr)
    return count
def calInsts(bl):
    """Count instruction heads in the block (start, end)."""
    start, end = bl
    total = 0
    addr = start
    while addr < end:
        total += 1
        addr = NextHead(addr)
    return total
def calLogicInstructions(bl):
    """Count x86/MIPS logic instructions in the block (start, end)."""
    x86_LI = {'and':1, 'andn':1, 'andnpd':1, 'andpd':1, 'andps':1, 'andnps':1, 'test':1, 'xor':1, 'xorpd':1, 'pslld':1}
    mips_LI = {'and':1, 'andi':1, 'or':1, 'ori':1, 'xor':1, 'nor':1, 'slt':1, 'slti':1, 'sltu':1}
    logic_ops = {}
    logic_ops.update(x86_LI)
    logic_ops.update(mips_LI)
    start, end = bl
    count = 0
    addr = start
    while addr < end:
        if GetMnem(addr) in logic_ops:
            count += 1
        addr = NextHead(addr)
    return count
def calSconstants(bl, mnemonics=None):
    """Count instructions in block *bl* whose mnemonic is in
    *mnemonics*.

    Fix: the original body read an undefined global ``calls`` and so
    raised NameError on every invocation. The mnemonic set is now an
    explicit, backward-compatible parameter (default: empty -> 0).
    """
    if mnemonics is None:
        mnemonics = {}
    start = bl[0]
    end = bl[1]
    invoke_num = 0
    inst_addr = start
    while inst_addr < end:
        opcode = GetMnem(inst_addr)
        if opcode in mnemonics:
            invoke_num += 1
        inst_addr = NextHead(inst_addr)
    return invoke_num
def calNconstants(bl):
    """Count instructions whose first or second operand is an
    immediate constant."""
    start = bl[0]
    end = bl[1]
    invoke_num = 0
    inst_addr = start
    while inst_addr < end:
        optype1 = GetOpType(inst_addr, 0)
        optype2 = GetOpType(inst_addr, 1)
        # 5 is idaapi.o_imm (immediate operand)
        if optype1 == 5 or optype2 == 5:
            invoke_num += 1
        inst_addr = NextHead(inst_addr)
    return invoke_num
def retrieveExterns(bl, ea_externs):
    """List extern symbols referenced from block *bl*.

    ea_externs: {address: extern name}, e.g. from processpltSegs.
    """
    externs = []
    start = bl[0]
    end = bl[1]
    inst_addr = start
    while inst_addr < end:
        refs = CodeRefsFrom(inst_addr, 1)
        try:
            # keep the first code reference landing in an extern stub
            ea = [v for v in refs if v in ea_externs][0]
            externs.append(ea_externs[ea])
        except:
            # IndexError: this instruction has no extern reference
            pass
        inst_addr = NextHead(inst_addr)
    return externs
def calTransferIns(bl):
    """Count control-transfer instructions in the block (start, end).

    Fix: ``arm_TI`` was defined but never merged into the lookup table,
    so ARM transfer instructions were never counted.
    """
    x86_TI = {'jmp':1, 'jz':1, 'jnz':1, 'js':1, 'je':1, 'jne':1, 'jg':1, 'jle':1, 'jge':1, 'ja':1, 'jnc':1, 'call':1}
    mips_TI = {'beq':1, 'bne':1, 'bgtz':1, "bltz":1, "bgez":1, "blez":1, 'j':1, 'jal':1, 'jr':1, 'jalr':1}
    arm_TI = {'MVN':1, "MOV":1}
    calls = {}
    calls.update(x86_TI)
    calls.update(mips_TI)
    calls.update(arm_TI)
    start = bl[0]
    end = bl[1]
    invoke_num = 0
    inst_addr = start
    while inst_addr < end:
        opcode = GetMnem(inst_addr)
        # substring match against table keys (e.g. 'j' matches 'jal')
        re = [v for v in calls if opcode in v]
        if len(re) > 0:
            invoke_num += 1
        inst_addr = NextHead(inst_addr)
    return invoke_num

View File

@ -1,24 +0,0 @@
import networkx as nx
import pdb
def betweeness(g):
    """Betweenness centrality of every node in *g*."""
    return nx.betweenness_centrality(g)
def eigenvector(g):
    """Eigenvector centrality of every node in *g*."""
    return nx.eigenvector_centrality(g)
def closeness_centrality(g):
    """Closeness centrality of every node in *g*."""
    return nx.closeness_centrality(g)
def retrieveGP(g):
    """Mean betweenness centrality of *g*, rounded to 5 decimals.

    Fix: returns 0.0 for an empty graph instead of dividing by zero.
    """
    bf = betweeness(g)
    if not bf:
        return 0.0
    x = sorted(bf.values())
    value = sum(x) / len(x)
    return round(value, 5)

View File

@ -1,27 +0,0 @@
from func import *
from raw_graphs import *
from idc import *
import os
import argparse
def parse_command():
    """Parse the --path CLI argument (output directory for .ida files)."""
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument("--path", type=str, help="The directory where to store the generated .ida file")
    return parser.parse_args()
if __name__ == '__main__':
    # Batch-mode IDA entry point: wait for auto-analysis, extract all
    # function CFGs, pickle them to <path>/<binary>.ida, then exit IDA.
    args = parse_command()
    path = args.path
    analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
    analysis_flags &= ~idc.AF_IMMOFF
    # turn off "automatically make offset" heuristic
    idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
    idaapi.autoWait()  # block until IDA's auto-analysis completes
    cfgs = get_func_cfgs_c(FirstSeg())
    binary_name = idc.GetInputFile() + '.ida'
    fullpath = os.path.join(path, binary_name)
    pickle.dump(cfgs, open(fullpath,'w'))
    print binary_name
    idc.Exit(0)

View File

@ -1,286 +0,0 @@
import itertools
import mmap
import os
import pdb
import re
import sys
sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
import networkx as nx
import numpy as np
from subprocess import Popen, PIPE
#from graph_edit_new import *
class raw_graph:
def __init__(self, funcname, g, func_f):
self.funcname = funcname
self.old_g = g[0]
self.g = nx.DiGraph()
self.entry = g[1]
self.fun_features = func_f
self.attributing()
def __len__(self):
return len(self.g)
def attributing(self):
self.obtainOffsprings(self.old_g)
for node in self.old_g:
fvector = self.retrieveVec(node, self.old_g)
self.g.add_node(node)
self.g.node[node]['v'] = fvector
for edge in self.old_g.edges():
node1 = edge[0]
node2 = edge[1]
self.g.add_edge(node1, node2)
def obtainOffsprings(self,g):
nodes = g.nodes()
for node in nodes:
offsprings = {}
self.getOffsprings(g, node, offsprings)
g.node[node]['offs'] = len(offsprings)
return g
def getOffsprings(self, g, node, offsprings):
node_offs = 0
sucs = g.successors(node)
for suc in sucs:
if suc not in offsprings:
offsprings[suc] = 1
self.getOffsprings(g, suc, offsprings)
def retrieveVec(self, id_, g):
feature_vec = []
#numC0
numc = g.node[id_]['consts']
feature_vec.append(numc)
#nums1
nums = g.node[id_]['strings']
feature_vec.append(nums)
#offsprings2
offs = g.node[id_]['offs']
feature_vec.append(offs)
#numAs3
numAs = g.node[id_]['numAs']
feature_vec.append(numAs)
# of calls4
calls = g.node[id_]['numCalls']
feature_vec.append(calls)
# of insts5
insts = g.node[id_]['numIns']
feature_vec.append(insts)
# of LIs6
insts = g.node[id_]['numLIs']
feature_vec.append(insts)
# of TIs7
insts = g.node[id_]['numTIs']
feature_vec.append(insts)
return feature_vec
def enumerating(self, n):
subgs = []
#pdb.set_trace()
for sub_nodes in itertools.combinations(self.g.nodes(), n):
subg = self.g.subgraph(sub_nodes)
u_subg = subg.to_undirected()
if nx.is_connected(u_subg):
subgs.append(subg)
return subgs
def genMotifs(self, n):
motifs = {}
subgs = enumerating(n)
for subg in subgs:
if len(motifs) == 0:
motifs[subg] = [subg]
else:
nomatch = True
for mt in motifs:
if nx.is_isomorphic(mt, subg):
motifs[mt].append(subg)
nomatch = False
if nomatch:
motifs[subg] = [subg]
return motifs
def enumerating_efficient(self, n):
#pdb.set_trace()
if len(self.g) >= 200:
return []
with open('/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/OUTPUT.txt','wb') as f:
nx.write_edgelist(self.g,f,data=False)
#pdb.set_trace()
process = Popen(["/home/qian/workspace/FANMOD-command_line-source/executables/./fanmod_command_line_linux", str(n), "100000", "1", "/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/OUTPUT.txt", "1", "0", "0", "2", "0", "0", "0", "1000", "3", "3", "/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt", "0", "1"], stdout=PIPE, stderr=PIPE)
stdout, stderr = process.communicate()
if process.returncode >= 0:
#os.system("/home/qian/software/FANMOD-command_line-source/executables/./fanmod_command_line_linux " +str(n) + " 100000 1 /home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/OUTPUT.txt 1 0 0 2 0 0 0 1000 3 3 /home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt 0 1")
#pdb.set_trace()
#pdb.set_trace()
subgs = self.parseOutput("/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt.dump", n)
#pdb.set_trace()
os.remove("/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt.dump")
return subgs
return []
def parseOutput(self, path, n):
pattern = re.compile('[0-9]+\,[0-9]+\,[0-9]+\,[0-9]+')
subgraphs = []
with open(path,'r') as f:
data = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
mo = re.findall(pattern, data)
if mo:
results = [map(int, v.split(',')[1:]) for v in mo]
subgraphs = self.createGraphDirectly(results)
return subgraphs
def parseOutputByconditions(self, path, n):
pattern = re.compile('[0-9]+\,[0-9]+\,[0-9]+\,[0-9]+')
subgraphs = []
with open(path,'r') as f:
data = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
mo = re.findall(pattern, data)
if mo:
results = [map(int, v.split(',')[1:]) for v in mo]
subgraphs = self.create_Graphbycondition_Directly(results)
return subgraphs
def create_Graphbycondition_Directly(self, results):
subgs = []
for indexes in results:
tg = template_graph()
subg = self.g.subgraph(indexes)
tg.updateG(subg)
subgs.append(tg)
del tg
return subgs
def createGraphDirectly(self, results):
#pdb.set_trace()
#subgs = [self.g.subgraph(indexes) for indexes in results]
subgs = []
for indexes in results:
tg = template_graph()
subg = self.g.subgraph(indexes)
tg.updateG(subg)
subgs.append(tg)
del tg
return subgs
def createGraph(self, results, n):
binary_value = int(results[0],2)
indexes = [int(v) for v in results[1:]]
fang = self.createG(results[0], n)
if fang:
tg = template_graph(binary_value)
tg.updateG(fang, indexes, self.g)
return tg
pdb.set_trace()
print "there is g which is none"
def createG(self, binary_str, n):
g = nx.DiGraph()
l = [int(v) for v in binary_str]
#pdb.set_trace()
shape = (n, n)
data = np.array(l)
ad_matrix = data.reshape(shape)
for i in xrange(n):
for j in xrange(n):
if ad_matrix[i][j] == 1:
g.add_edge(i, j)
return g
class raw_graphs:
    """Ordered collection of raw_graph objects for one binary."""
    def __init__(self, binary_name):
        self.binary_name = binary_name
        self.raw_graph_list = []

    def append(self, raw_g):
        """Add one raw_graph to the collection."""
        self.raw_graph_list.append(raw_g)

    def __len__(self):
        return len(self.raw_graph_list)
class graphlets:
    """Collection of graphlet subgraphs extracted from one function."""
    def __init__(self, funcname):
        self.funcname = funcname
        self.graphlets_list = []
        self.binary_name = None

    def updateBN(self, binary_name):
        """Record the binary this function came from."""
        self.binary_name = binary_name

    def append(self, subg):
        """Add a single graphlet."""
        self.graphlets_list.append(subg)

    def appendSet(self, subgs):
        """Add a batch of graphlets."""
        self.graphlets_list += subgs

    def __len__(self):
        return len(self.graphlets_list)
class template_graph:
    """Wrapper pairing a motif's numeric id with its concrete graph."""
    def __init__(self, value=None):
        self.value = value  # adjacency bit-pattern as int, if known
        self.g = None

    def updateG(self, g):
        """Attach the concrete (sub)graph."""
        self.g = g
class template_graphs:
    """Enumerate all connected directed graphs of a fixed node count.

    Each candidate is a size*size adjacency-matrix bit-string;
    enumeratingAll() keeps the connected ones covering all nodes.

    Fix: createG depends on numpy, whose import was commented out at
    module level (restored there); the loop variable also shadowed the
    ``i`` parameter, now renamed for clarity.
    """
    def __init__(self, size):
        self.size = size
        self.gs = []          # accepted template_graph instances
        self.bit_len = None   # size*size, set by genBinValue

    def enumeratingAll(self):
        """Try every non-zero adjacency pattern and keep graphs that
        are connected and cover all nodes."""
        binary_value = self.genBinValue()
        for value in xrange(1, binary_value):
            g = self.createG(value)
            if g:
                tg = template_graph(value)
                tg.updateG(g)
                self.gs.append(tg)

    def genBinValue(self):
        """Return 2**(size*size), the number of adjacency patterns."""
        n = self.size
        self.bit_len = n*n
        return 2**(self.bit_len)

    def createG(self, value):
        """Decode *value* into a DiGraph; return it when it has exactly
        self.size nodes and is weakly connected, else False."""
        g = nx.DiGraph()
        bits = self.genArray(value)
        ad_matrix = np.array(bits).reshape((self.size, self.size))
        for row in xrange(self.size):
            for col in xrange(self.size):
                if ad_matrix[row][col] == 1:
                    g.add_edge(row, col)
        u_g = g.to_undirected()
        if len(g) == self.size and nx.is_connected(u_g):
            return g
        return False

    def genArray(self, value):
        """Bit-string of *value*, zero-padded to bit_len, as int list."""
        l = [int(x) for x in bin(value)[2:]]
        return [0] * (self.bit_len - len(l)) + l

View File

@ -1,356 +0,0 @@
import cPickle as pickle
from search import *
from nearpy import Engine
from nearpy.hashes import RandomDiscretizedProjections
from nearpy.filters import NearestFilter, UniqueFilter
from nearpy.distances import EuclideanDistance
from nearpy.distances import CosineDistance
from nearpy.hashes import RandomBinaryProjections
from nearpy.experiments import DistanceRatioExperiment
from redis import Redis
from nearpy.storage import RedisStorage
from feature import *
import numpy as np
import os
import pdb
import argparse
import time
import numpy as np
from refactoring import *
import pymongo
from pymongo import MongoClient
def initDB():
    """Connect to local MongoDB and return the 'iot-encoding' database.

    NOTE(review): the first two MongoClient calls are discarded; only
    the URI-based client is actually used.
    """
    client = MongoClient()
    client = MongoClient('localhost', 27017)
    client = MongoClient('mongodb://localhost:27017/')
    db = client.test_database  # overwritten immediately below
    db = client['iot-encoding']
    return db
# Module-level handles: the Mongo database and its 'posts' collection.
# NOTE(review): the name 'db' is rebound by the class definition below.
db = initDB()
posts = db.posts
class db:
    """LSH-backed function-feature store (nearpy engine over redis).

    NOTE(review): this class shadows the module-level Mongo handle of
    the same name defined just above.
    """
    def __init__(self):
        self.feature_list = {}
        self.engine = None  # nearpy Engine, created by loadHashmap
    def loadHashmap(self, feature_size, result_n):
        """Create or restore the random-binary-projection LSH engine
        backed by local redis.

        NOTE(review): feature_size/result_n are ignored -- the engine
        is hard-coded to 192 dims and a 1000-nearest filter; two
        pdb.set_trace() breakpoints are left in.
        """
        # Create redis storage adapter
        redis_object = Redis(host='localhost', port=6379, db=0)
        redis_storage = RedisStorage(redis_object)
        pdb.set_trace()
        try:
            # Get hash config from redis
            config = redis_storage.load_hash_configuration('test')
            # Config is existing, create hash with None parameters
            lshash = RandomBinaryProjections(None, None)
            # Apply configuration loaded from redis
            lshash.apply_config(config)
        except:
            # Config is not existing, create hash from scratch, with 10 projections
            lshash = RandomBinaryProjections('test', 0)
        # Create engine for feature space of 100 dimensions and use our hash.
        # This will set the dimension of the lshash only the first time, not when
        # using the configuration loaded from redis. Use redis storage to store
        # buckets.
        nearest = NearestFilter(1000)
        #self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
        pdb.set_trace()
        self.engine = Engine(192, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance())
        # Do some stuff like indexing or querying with the engine...
        # Finally store hash configuration in redis for later use
        redis_storage.store_hash_configuration(lshash)
    def appendToDB(self, binary_name, funcname, fvector, firmware_name=""):
        """Store one vector under the key 'firmware.binary.funcname';
        silently skips None vectors."""
        if fvector is None:
            return
        #ftuple = tuple([fvector])
        self.engine.store_vector(np.asarray(fvector), ".".join((firmware_name,binary_name,funcname)))
    def batch_appendDB(self, binary_name, features, firmware_name=""):
        """Store every {funcname: vector} entry of *features*."""
        for funcname in features:
            feature = features[funcname]
            #pdb.set_trace()
            self.appendToDB(binary_name, funcname, feature, firmware_name)
    def batch_appendDBbyDir(self, base_dir):
        """Store all vectors found in Mongo for a hard-coded firmware.

        NOTE(review): base_dir is unused; data comes from the
        module-level 'posts' collection.
        """
        cursor = posts.find({"firmware_name":"ddwrt-r21676_result"})
        i = 0
        for v in cursor:
            print i
            i+=1
            binary_name = v['binary_name']
            funcname = v['func_name']
            firmware_name = v['firmware_name']
            feature = v['fvector']
            self.appendToDB(binary_name, funcname, feature, firmware_name)
    def batch_appendDBbyDir1(self, base_dir):
        """Store vectors from pickled '*.features' files located under
        base_dir/image/<firmware>/.

        NOTE(review): a pdb.set_trace() breakpoint is left in.
        """
        image_dir = os.path.join(base_dir, "image")
        firmware_featrues={}
        bnum = 0  # number of feature files seen
        fnum = 0  # number of functions stored
        i = 0
        pdb.set_trace()
        for firmware_name in os.listdir(image_dir):
            print firmware_name
            firmware_featrues[firmware_name] = {}
            firmware_dir = os.path.join(image_dir, firmware_name)
            for binary_name in os.listdir(firmware_dir):
                if binary_name.endswith(".features"):
                    bnum += 1
                    featrues_dir = os.path.join(firmware_dir, binary_name)
                    featrues = pickle.load(open(featrues_dir, "r"))
                    for funcname in featrues:
                        fnum +=1
                        #pdb.set_trace()
                        feature = featrues[funcname]
                        self.appendToDB(binary_name, funcname, feature, firmware_name)
                    del featrues
        print("bnum ", bnum)
        print("fnum ", fnum)
    def dump(self, base_dir):
        """Pickle the feature map and LSH engine under base_dir/data/db/."""
        db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
        pickle.dump(self.feature_list, open(db_dir, 'w'))
        db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
        pickle.dump(self.engine, open(db_dir, 'w'))
    def loadDB(self, base_dir):
        """Inverse of dump(): restore feature map and engine."""
        db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
        self.feature_list = pickle.load(open(db_dir, 'r'))
        db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
        self.engine = pickle.load(open(db_dir, 'r'))
    def findF(self, binary_name, funcname):
        """Return the first firmware whose mapping contains
        binary_name/funcname (IndexError when absent)."""
        x = [v for v in self.feature_list if binary_name in self.feature_list[v] and funcname in self.feature_list[v][binary_name]]
        return x[0]
def retrieveFeaturesByDir(n, base_dir, db_instance=None):
    """Load pickled features from the first '*.features' entry under
    *base_dir* and store them into *db_instance* (if given).

    Fixes: the original called the nonexistent str.endWith (typo for
    endswith) and referenced ``self`` in a module-level function; the
    target store is now an explicit, backward-compatible parameter.
    A leftover pdb.set_trace() was removed.
    """
    firmware_featrues = {}
    i = 0
    for firmware_name in os.listdir(base_dir):
        if not firmware_name.endswith(".features"):
            continue
        firmware_featrues[firmware_name] = {}
        firmware_dir = os.path.join(base_dir, firmware_name)
        # original logic: only the first matching directory is processed
        if i > 0:
            break
        i += 1
        for binary_name in os.listdir(firmware_dir):
            featrues_dir = os.path.join(firmware_dir, binary_name + "_cb" + str(n) + ".features")
            featrues = pickle.load(open(featrues_dir, "r"))
            for funcname in featrues:
                feature = featrues[funcname]
                if db_instance is not None:
                    # NOTE(review): argument order kept from the
                    # original; db.appendToDB expects (binary, func,
                    # vector, firmware) -- confirm intended mapping
                    db_instance.appendToDB(firmware_name, binary_name, funcname, feature)
            del featrues
def retrieveFeatures(n, base_dir, filename, funcs):
    """Load {funcname: np.asarray(vector)} from the pickled file
    base_dir/5000/<filename>_cb<n>.features (funcs is unused filter)."""
    path = os.path.join(base_dir, "5000", filename + "_cb" + str(n) + ".features")
    loaded = pickle.load(open(path, "r"))
    feature_dic = {}
    for name, vec in loaded.items():
        feature_dic[name] = np.asarray(vec)
    return feature_dic
def retrieveVuldb(base_input_dir):
    """Load the pickled vulnerability DB at <base_input_dir>/vul."""
    return pickle.load(open(os.path.join(base_input_dir, "vul"), "r"))
def retrieveFeaturesx(filename):
    """Load pickled features from ./data/<filename>.features."""
    path = os.path.join("./data/", filename + ".features")
    return pickle.load(open(path, "r"))
def retrieveQueries(n, base_dir, filename1, featrues_src):
    """Load query vectors {funcname: np.asarray(vector)} from
    base_dir/5000/<filename1>_cb<n>.features (featrues_src unused)."""
    path = os.path.join(base_dir, "5000", filename1 + "_cb" + str(n) + ".features")
    loaded = pickle.load(open(path, "r"))
    queries = {}
    for name, vec in loaded.items():
        queries[name] = np.asarray(vec)
    return queries
def retrieveQueriesbyDir(n, base_dir, firmware_name, filename1):
    """Load query vectors from
    base_dir/<firmware_name>/<filename1>_cb<n>.features."""
    path = os.path.join(base_dir, firmware_name, filename1 + "_cb" + str(n) + ".features")
    loaded = pickle.load(open(path, "r"))
    return dict((name, np.asarray(loaded[name])) for name in loaded)
def retrieveQuery(n, base_dir, filename, funcname):
    """Return the vector of the first stored function whose name
    contains *funcname* (IndexError when no match)."""
    path = os.path.join(base_dir, filename + "_cb" + str(n) + ".features")
    loaded = pickle.load(open(path, "r"))
    match = [loaded[v] for v in loaded if funcname in v][0]
    return np.asarray(match)
def parse_command():
    """Parse CLI arguments for the search experiment."""
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument("--base_input_dir", type=str, help="raw binaries to process for training")
    parser.add_argument('--output_dir', type=str, help="output dir")
    parser.add_argument("--filename1", type=str, help="the size of each graphlet")
    parser.add_argument("--filename2", type=str, help="the size of each graphlet")
    parser.add_argument("--size", type=int, help="the size of each graphlet")
    return parser.parse_args()
def loadFuncs(path):
    """Read <path>/func_candid and return {funcname: 1} per line.

    Fixes: the file handle was never closed (now a with-block), and
    split("\n") parsing kept trailing '\r' on CRLF files and recorded
    blank lines -- lines are now stripped and empties skipped.
    """
    funcs = {}
    x86_dir = os.path.join(path, "func_candid")
    #mips_dir = os.path.join(path, "openssl1.0.1a_mips.ida")
    with open(x86_dir, "r") as fp:
        for line in fp:
            funcname = line.strip()
            if funcname:
                funcs[funcname] = 1
    return funcs
def dump(path, featrues, queries):
    """Write a TSV matrix of x86 feature rows and mips query rows to
    <path>/matrix.

    NOTE(review): the format string expects exactly 16 float values per
    row after the two labels; rows of any other width raise TypeError.
    """
    fp = open(path + "/" + "matrix", 'w')
    for name in featrues:
        row = []
        row.append("x86")
        row.append(name)
        row += featrues[name]
        fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" %tuple(row))
    for name in queries:
        row = []
        row.append("mips")
        row.append(name)
        row += queries[name]
        fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % tuple(row))
    fp.close()
def queryBytwo(base_input_dir, filename1, filename2, n):
    """Index filename1's feature vectors, query with filename2's, and
    print top-k accuracy for thresholds 1..200 (step 10).

    NOTE(review): pdb.set_trace() breakpoints are left in; remove them
    before unattended runs.
    """
    threthold = 50
    db_instance = db()
    funcs = loadFuncs(base_input_dir)
    db_instance.loadHashmap(n, 50000)
    #pdb.set_trace()
    featrues = retrieveFeatures(n, base_input_dir, filename1, funcs)
    queries = retrieveQueries(n, base_input_dir, filename2, funcs)
    #queries = refactoring(queries, featrues)
    vul_db = retrieveVuldb(base_input_dir)
    pdb.set_trace()
    #dump(base_input_dir, featrues, queries)
    #start = time.time()
    #db_instance.batch_appendDBbyDir(base_input_dir)
    #end = time.time()
    #total = end - start
    #print total
    db_instance.batch_appendDB(filename1, featrues)
    pdb.set_trace()
    ranks = []
    times = []
    for threthold in xrange(1, 210, 10):
        hit = []
        i = 0
        for name in queries:
            #print i
            i += 1
            '''
            if i == 1000:
                print (sum(times)/len(times))
                pdb.set_trace()
                print "s"
            '''
            #if name not in vul_db['openssl']:
            #	continue
            # only score queries that are also present in the index
            if name not in featrues:
                continue
            #pdb.set_trace()
            query = queries[name]
            #start = time.time()
            x = db_instance.engine.neighbours(query)
            #end = time.time()
            #total = end - start
            #times.append(total)
            #print total
            #pdb.set_trace()
            try:
                # rank of the true match among returned neighbours
                rank = [v for v in xrange(len(x)) if name in x[v][1]][0]
                ranks.append((name, rank))
                if rank <= threthold:
                    hit.append(1)
                else:
                    hit.append(0)
            except:
                #pdb.set_trace()
                # true match absent from the neighbour list
                hit.append(0)
                pass
        #pdb.set_trace()
        acc = sum(hit) * 1.0 / len(hit)
        print acc
def queryAll(base_dir, firmware_name, filename1, n):
threthold = 155
db_instance = db()
db_instance.loadHashmap(n, 50000)
queries = retrieveQueriesbyDir(n, base_dir, firmware_name, filename1)
start = time.time()
pdb.set_trace()
db_instance.batch_appendDBbyDir(n, base_dir)
end = time.time()
dur = end - start
print dur
pdb.set_trace()
hit = []
i = 0
times = []
for name in queries:
print i
i += 1
query = queries[name]
start = time.clock()
x = db_instance.engine.neighbours(query)
end = time.clock()
dur = end - start
times.append(dur)
#pdb.set_trace()
try:
rank = [v for v in xrange(len(x)) if name in x[v][1]]
if len(rank) > 1:
pdb.set_trace()
print "stop"
if rank[0] <= threthold:
hit.append(1)
else:
hit.append(0)
except:
hit.append(0)
acc = sum(hit) * 1.0 / len(hit)
mean = np.mean(times)
std = np.std(times)
#pdb.set_trace()
print acc
if __name__ == "__main__":
    # CLI entry point: run the two-file query experiment.
    # Fix: removed a pdb.set_trace() that halted every invocation.
    args = parse_command()
    base_dir = args.base_input_dir
    filename1 = args.filename1
    filename2 = args.filename2
    n = args.size
    queryBytwo(base_dir, filename1, filename2, n)