diff --git a/README.md b/README.md index e69de29..55d5dd1 100644 --- a/README.md +++ b/README.md @@ -0,0 +1 @@ +This is a test! diff --git a/raw-feature-extractor/cfg_constructor.py b/raw-feature-extractor/cfg_constructor.py new file mode 100755 index 0000000..9bd3bdd --- /dev/null +++ b/raw-feature-extractor/cfg_constructor.py @@ -0,0 +1,286 @@ +import copy +import networkx as nx +from idautils import * +from idaapi import * +from idc import * + +import copy +import networkx as nx +from idautils import * +from idaapi import * +from idc import * +from graph_analysis_ida import * + + +def getCfg(func, externs_eas, ea_externs): + func_start = func.startEA + func_end = func.endEA + cfg = nx.DiGraph() + control_blocks, main_blocks = obtain_block_sequence(func) + i = 0 + visited = {} + start_node = None + for bl in control_blocks: + start = control_blocks[bl][0] + end = control_blocks[bl][1] + src_node = (start, end) + if src_node not in visited: + src_id = len(cfg) + visited[src_node] = src_id + cfg.add_node(src_id) + cfg.node[src_id]['label'] = src_node + else: + src_id = visited[src_node] + + #if end in seq_blocks and GetMnem(PrevHead(end)) != 'jmp': + if start == func_start: + cfg.node[src_id]['c'] = "start" + start_node = src_node + if end == func_end: + cfg.node[src_id]['c'] = "end" + #print control_ea, 1 + refs = CodeRefsTo(start, 0) + for ref in refs: + if ref in control_blocks: + dst_node = control_blocks[ref] + if dst_node not in visited: + visited[dst_node] = len(cfg) + dst_id = visited[dst_node] + cfg.add_edge(dst_id, src_id) + cfg.node[dst_id]['label'] = dst_node + #print control_ea, 1 + refs = CodeRefsTo(start, 1) + for ref in refs: + if ref in control_blocks: + dst_node = control_blocks[ref] + if dst_node not in visited: + visited[dst_node] = len(cfg) + dst_id = visited[dst_node] + cfg.add_edge(dst_id, src_id) + cfg.node[dst_id]['label'] = dst_node + #print "attributing" + attributingRe(cfg, externs_eas, ea_externs) + # removing deadnodes + #old_cfg = copy.deepcopy(cfg) + #transform(cfg) + return cfg, 0 + +def transform(cfg): + merging(cfg) + filtering(cfg) + +def merging(cfg): + bb_ids = cfg.nodes() + for bb_id in bb_ids: + try: + bb = cfg.node[bb_id]['label'] + bb_start = bb[0] + bb_end = bb[1] + succs = cfg.successors(bb_id) + #preds = cfg.predecessors(bb_id) + if len(succs) == 1: + preds = cfg.predecessors(succs[0]) + if len(preds) == 1: + domerge(cfg, bb_id, succs[0]) + except: + pass + +def domerge(cfg, bb_id, suc_node): + suc_nodes = cfg.successors(suc_node) + for node in suc_nodes: + cfg.add_edge(bb_id, node) + cfg.remove_node(suc_node) + + +def filtering(cfg): + rm_sets = [] + for bb_id in cfg: + bb = cfg.node[bb_id]['label'] + bb_start = bb[0] + bb_end = bb[1] + re = remove(bb_start, bb_end) + print bb_id, re, bb_start, bb_end + if re: + print re, bb_id + rm_sets.append(bb_id) + print rm_sets + for bb_id in rm_sets: + cfg.remove_node(bb_id) + +def remove(bb_start, bb_end): + seqs = getSequences(bb_start, bb_end) + if matchseq(seqs): + return True + return False + +def matchseq(seqs): + mips = set(['lw', "jr", "addiu"]) + x86 = set(['add', 'pop', 'retn']) + b_mips = set(['b', ('move','$v0')]) + b_x86 = set(['b', ('mov','$eax')]) + re_mips = set([('move','$v0')]) + re_x86 = set([('mov','$eax')]) + diff_mips = set(seqs).difference(set(mips)) + if len(diff_mips) == 0: + return True + diff_x86 = set(seqs).difference(set(x86)) + if len(diff_x86) == 0: + return True + if set(seqs) == b_mips: + return True + if set(seqs) == b_x86: + return True + if set(seqs) == re_mips: + return True + if set(seqs) == re_x86: + return True + return False + +def attributingRe(cfg, externs_eas, ea_externs): + for node_id in cfg: + bl = cfg.node[node_id]['label'] + numIns = calInsts(bl) + cfg.node[node_id]['numIns'] = numIns + numCalls = calCalls(bl) + cfg.node[node_id]['numCalls'] = numCalls + numLIs = calLogicInstructions(bl) + cfg.node[node_id]['numLIs'] = numLIs + numAs = calArithmeticIns(bl) + cfg.node[node_id]['numAs'] = numAs + strings, consts = getBBconsts(bl) + cfg.node[node_id]['numNc'] = len(strings) + len(consts) + cfg.node[node_id]['consts'] = consts + cfg.node[node_id]['strings'] = strings + externs = retrieveExterns(bl, ea_externs) + cfg.node[node_id]['externs'] = externs + numTIs = calTransferIns(bl) + cfg.node[node_id]['numTIs'] = numTIs + + +def attributing(cfg): + ga = graph_analysis() + ga.gwithoffspring(cfg) + print "finishing offspring" + for node in cfg: + stmt_num = getStmtNum(node) + binary_value = getBinaryValue(node) + cfg.node[node]['stmt_num'] = stmt_num + cfg.node[node]['binary_value'] = binary_value + ga.domChecking(cfg) + print "finishing domChecking" + ga.loopChecking(cfg) + print "finishing loopChecking" + + +def getStmtNum(node): + start = node[0] + end = node[1] + stmt_num = 0 + inst_addr = start + while inst_addr < end: + inst_addr = NextHead(inst_addr) + stmt_num += 1 + return stmt_num + +def getBinaryValue(node): + start = node[0] + inst_addr = NextHead(start) + value = 0 + addr = 0 + for x in xrange((inst_addr - start)-1): + addr = start + x + y = GetOriginalByte(addr) + print value, addr, y + value = value | y + value = value << 8 + print value + + addr = inst_addr - 1 + y = GetOriginalByte(addr) + print value, addr, y + value = value | y + print node + print bin(value) + return value + + +def cfg_construct(func): + func_start = func.startEA + func_end = func.endEA + cfg = nx.DiGraph() + seq_blocks, main_blocks = obtain_block_sequence(func) + i = 0 + visited = {} + for bl in seq_blocks: + start = seq_blocks[bl][0] + end = seq_blocks[bl][1] + src_node = (start, end) + if end in seq_blocks and GetMnem(PrevHead(end)) != 'jmp': + next_start = seq_blocks[end][0] + next_end = seq_blocks[end][1] + next_node = (next_start, next_end) + cfg.add_edge(src_node, next_node) + if start == func_start: + cfg.add_node(src_node, c='start') + start_node = src_node + if end == func_end: + cfg.add_node(src_node, c='end') + refs = CodeRefsFrom(PrevHead(end), 0) + + for ref in refs: + #print ref + if ref in seq_blocks: + dst_node = (seq_blocks[ref][0], seq_blocks[ref][1]) + cfg.add_edge(src_node, dst_node) + return cfg, start_node + + +def obtain_allpaths( cfg, node, path, allpaths): + path.append(node) + if 'c' in cfg.node[node] and cfg.node[node]['c'] == 'end': + allpaths.append(path) + return + else: + for suc in cfg.successors(node): + if suc not in path: + path_copy = copy.copy(path) + obtain_allpaths(cfg, suc, path_copy, allpaths) + + +def obtain_block_sequence(func): + control_blocks = {} + main_blocks = {} + blocks = [(v.startEA, v.endEA) for v in FlowChart(func)] + for bl in blocks: + base = bl[0] + end = PrevHead(bl[1]) + control_ea = checkCB(bl) + control_blocks[control_ea] = bl + control_blocks[end] = bl + if func.startEA <= base <= func.endEA: + main_blocks[base] = bl + x = sorted(main_blocks) + return control_blocks, x + +def checkCB(bl): + start = bl[0] + end = bl[1] + ea = start + while ea < end: + if checkCondition(ea): + return ea + ea = NextHead(ea) + + return PrevHead(end) + +def checkCondition(ea): + mips_branch = {"beqz":1, "beq":1, "bne":1, "bgez":1, "b":1, "bnez":1, "bgtz":1, "bltz":1, "blez":1, "bgt":1, "bge":1, "blt":1, "ble":1, "bgtu":1, "bgeu":1, "bltu":1, "bleu":1} + x86_branch = {"jz":1, "jnb":1, "jne":1, "je":1, "jg":1, "jle":1, "jl":1, "jge":1, "ja":1, "jae":1, "jb":1, "jbe":1, "jo":1, "jno":1, "js":1, "jns":1} + arm_branch = {"B":1, "BAL":1, "BNE":1, "BEQ":1, "BPL":1, "BMI":1, "BCC":1, "BLO":1, "BCS":1, "BHS":1, "BVC":1, "BVS":1, "BGT":1, "BGE":1, "BLT":1, "BLE":1, "BHI":1 ,"BLS":1 } + conds = {} + conds.update(mips_branch) + conds.update(x86_branch) + opcode = GetMnem(ea) + if opcode in conds: + return True + return False diff --git a/raw-feature-extractor/func.py b/raw-feature-extractor/func.py new file mode 100755 index 0000000..b68026b --- /dev/null +++ b/raw-feature-extractor/func.py @@ -0,0 +1,284 @@ +# +# Reference Lister +# +# List all functions and all references to them in the current section. +# +# Implemented with the idautils module +# +from idautils import * +from idaapi import * +from idc import * +import networkx as nx +import cfg_constructor as cfg +import cPickle as pickle +import pdb +from raw_graphs import * +from discovRe_feature.discovRe import * +#import wingdbstub +#wingdbstub.Ensure() +def gt_funcNames(ea): + funcs = [] + plt_func, plt_data = processpltSegs() + for funcea in Functions(SegStart(ea)): + funcname = get_unified_funcname(funcea) + if funcname in plt_func: + print funcname + continue + funcs.append(funcname) + return funcs + +def get_funcs(ea): + funcs = {} + # Get current ea + # Loop from start to end in the current segment + plt_func, plt_data = processpltSegs() + for funcea in Functions(SegStart(ea)): + funcname = get_unified_funcname(funcea) + if funcname in plt_func: + continue + func = get_func(funcea) + blocks = FlowChart(func) + funcs[funcname] = [] + for bl in blocks: + start = bl.startEA + end = bl.endEA + funcs[funcname].append((start, end)) + return funcs + +# used for the callgraph generation. +def get_func_namesWithoutE(ea): + funcs = {} + plt_func, plt_data = processpltSegs() + for funcea in Functions(SegStart(ea)): + funcname = get_unified_funcname(funcea) + if 'close' in funcname: + print funcea + if funcname in plt_func: + print funcname + continue + funcs[funcname] = funcea + return funcs + +# used for the callgraph generation. +def get_func_names(ea): + funcs = {} + for funcea in Functions(SegStart(ea)): + funcname = get_unified_funcname(funcea) + funcs[funcname] = funcea + return funcs + +def get_func_bases(ea): + funcs = {} + plt_func, plt_data = processpltSegs() + for funcea in Functions(SegStart(ea)): + funcname = get_unified_funcname(funcea) + if funcname in plt_func: + continue + funcs[funcea] = funcname + return funcs + +def get_func_range(ea): + funcs = {} + for funcea in Functions(SegStart(ea)): + funcname = get_unified_funcname(funcea) + func = get_func(funcea) + funcs[funcname] = (func.startEA, func.endEA) + return funcs + +def get_unified_funcname(ea): + funcname = GetFunctionName(ea) + if len(funcname) > 0: + if '.' == funcname[0]: + funcname = funcname[1:] + return funcname + +def get_func_sequences(ea): + funcs_bodylist = {} + funcs = get_funcs(ea) + for funcname in funcs: + if funcname not in funcs_bodylist: + funcs_bodylist[funcname] = [] + for start, end in funcs[funcname]: + inst_addr = start + while inst_addr <= end: + opcode = GetMnem(inst_addr) + funcs_bodylist[funcname].append(opcode) + inst_addr = NextHead(inst_addr) + return funcs_bodylist + +def get_func_cfgs_c(ea): + binary_name = idc.GetInputFile() + raw_cfgs = raw_graphs(binary_name) + externs_eas, ea_externs = processpltSegs() + i = 0 + for funcea in Functions(SegStart(ea)): + funcname = get_unified_funcname(funcea) + func = get_func(funcea) + print i + i += 1 + icfg = cfg.getCfg(func, externs_eas, ea_externs) + func_f = get_discoverRe_feature(func, icfg[0]) + raw_g = raw_graph(funcname, icfg, func_f) + raw_cfgs.append(raw_g) + + return raw_cfgs + +def get_func_cfgs_ctest(ea): + binary_name = idc.GetInputFile() + raw_cfgs = raw_graphs(binary_name) + externs_eas, ea_externs = processpltSegs() + i = 0 + diffs = {} + for funcea in Functions(SegStart(ea)): + funcname = get_unified_funcname(funcea) + func = get_func(funcea) + print i + i += 1 + icfg, old_cfg = cfg.getCfg(func, externs_eas, ea_externs) + diffs[funcname] = (icfg, old_cfg) + #raw_g = raw_graph(funcname, icfg) + #raw_cfgs.append(raw_g) + + return diffs + +def get_func_cfgs(ea): + func_cfglist = {} + i = 0 + for funcea in Functions(SegStart(ea)): + funcname = get_unified_funcname(funcea) + func = get_func(funcea) + print i + i += 1 + try: + icfg = cfg.getCfg(func) + func_cfglist[funcname] = icfg + except: + pass + + return func_cfglist + +def get_func_cfg_sequences(func_cfglist): + func_cfg_seqlist = {} + for funcname in func_cfglist: + func_cfg_seqlist[funcname] = {} + cfg = func_cfglist[funcname][0] + for start, end in cfg: + codesq = get_sequences(start, end) + func_cfg_seqlist[funcname][(start,end)] = codesq + + return func_cfg_seqlist + + +def get_sequences(start, end): + seq = [] + inst_addr = start + while inst_addr <= end: + opcode = GetMnem(inst_addr) + seq.append(opcode) + inst_addr = NextHead(inst_addr) + return seq + +def get_stack_arg(func_addr): + print func_addr + args = [] + stack = GetFrame(func_addr) + if not stack: + return [] + firstM = GetFirstMember(stack) + lastM = GetLastMember(stack) + i = firstM + while i <=lastM: + mName = GetMemberName(stack,i) + mSize = GetMemberSize(stack,i) + if mSize: + i = i + mSize + else: + i = i+4 + if mName not in args and mName and ' s' not in mName and ' r' not in mName: + args.append(mName) + return args + + #pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w')) + +def processExternalSegs(): + funcdata = {} + datafunc = {} + for n in xrange(idaapi.get_segm_qty()): + seg = idaapi.getnseg(n) + ea = seg.startEA + segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE) + if segtype in [idc.SEG_XTRN]: + start = idc.SegStart(ea) + end = idc.SegEnd(ea) + cur = start + while cur <= end: + name = get_unified_funcname(cur) + funcdata[name] = hex(cur) + cur = NextHead(cur) + return funcdata + +def processpltSegs(): + funcdata = {} + datafunc = {} + for n in xrange(idaapi.get_segm_qty()): + seg = idaapi.getnseg(n) + ea = seg.startEA + segname = SegName(ea) + if segname in ['.plt', 'extern', '.MIPS.stubs']: + start = seg.startEA + end = seg.endEA + cur = start + while cur < end: + name = get_unified_funcname(cur) + funcdata[name] = hex(cur) + datafunc[cur]= name + cur = NextHead(cur) + return funcdata, datafunc + + +def processDataSegs(): + funcdata = {} + datafunc = {} + for n in xrange(idaapi.get_segm_qty()): + seg = idaapi.getnseg(n) + ea = seg.startEA + segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE) + if segtype in [idc.SEG_DATA, idc.SEG_BSS]: + start = idc.SegStart(ea) + end = idc.SegEnd(ea) + cur = start + while cur <= end: + refs = [v for v in DataRefsTo(cur)] + for fea in refs: + name = get_unified_funcname(fea) + if len(name)== 0: + continue + if name not in funcdata: + funcdata[name] = [cur] + else: + funcdata[name].append(cur) + if cur not in datafunc: + datafunc[cur] = [name] + else: + datafunc[cur].append(name) + cur = NextHead(cur) + return funcdata, datafunc + +def obtainDataRefs(callgraph): + datarefs = {} + funcdata, datafunc = processDataSegs() + for node in callgraph: + if node in funcdata: + datas = funcdata[node] + for dd in datas: + refs = datafunc[dd] + refs = list(set(refs)) + if node in datarefs: + print refs + datarefs[node] += refs + datarefs[node] = list(set(datarefs[node])) + else: + datarefs[node] = refs + return datarefs + + diff --git a/raw-feature-extractor/graph_analysis_ida.py b/raw-feature-extractor/graph_analysis_ida.py new file mode 100644 index 0000000..3d82bf0 --- /dev/null +++ b/raw-feature-extractor/graph_analysis_ida.py @@ -0,0 +1,156 @@ +from idautils import * +from idaapi import * +from idc import * + +def getSequences(start, end): + seqs = [] + inst_addr = start + while inst_addr < end: + opcode = GetMnem(inst_addr) + if opcode == 'move' or opcode == "mov": + opnd1 = GetOpnd(inst_addr,0) + if opnd1 == '$v0' or opnd1 == "$eax": + opcode = (opcode, opnd1) + seqs.append(opcode) + inst_addr = NextHead(inst_addr) + return seqs + +def calArithmeticIns(bl): + x86_AI = {'add':1, 'sub':1, 'div':1, 'imul':1, 'idiv':1, 'mul':1, 'shl':1, 'dec':1, 'inc':1} + mips_AI = {'add':1, 'addu':1, 'addi':1, 'addiu':1, 'mult':1, 'multu':1, 'div':1, 'divu':1} + arm_AI = {"ADD":1, "ADC":1, "SUB":1, "SBC":1, "RSB":1, "RSC":1, "MUL":1, "MLA":1} + calls = {} + calls.update(x86_AI) + calls.update(mips_AI) + start = bl[0] + end = bl[1] + invoke_num = 0 + inst_addr = start + while inst_addr < end: + opcode = GetMnem(inst_addr) + re = [v for v in calls if opcode in v] + if len(re) > 0: + invoke_num += 1 + inst_addr = NextHead(inst_addr) + return invoke_num + +def calCalls(bl): + calls = {'call':1, 'jal':1, 'jalr':1, "BL":1} + start = bl[0] + end = bl[1] + invoke_num = 0 + inst_addr = start + while inst_addr < end: + opcode = GetMnem(inst_addr) + re = [v for v in calls if opcode in v] + if len(re) > 0: + invoke_num += 1 + inst_addr = NextHead(inst_addr) + return invoke_num + +def calInsts(bl): + start = bl[0] + end = bl[1] + ea = start + num = 0 + while ea < end: + num += 1 + ea = NextHead(ea) + return num + +def calLogicInstructions(bl): + x86_LI = {'and':1, 'andn':1, 'andnpd':1, 'andpd':1, 'andps':1, 'andnps':1, 'test':1, 'xor':1, 'xorpd':1, 'pslld':1} + mips_LI = {'and':1, 'andi':1, 'or':1, 'ori':1, 'xor':1, 'nor':1, 'slt':1, 'slti':1, 'sltu':1} + arm_LI = {"AND":1, "EOR":1, "ORR":1, "ORN":1, 'BIC':1} + calls = {} + calls.update(x86_LI) + calls.update(mips_LI) + calls.update(arm_LI) + start = bl[0] + end = bl[1] + invoke_num = 0 + inst_addr = start + while inst_addr < end: + opcode = GetMnem(inst_addr) + re = [v for v in calls if opcode in v] + if len(re) > 0: + invoke_num += 1 + inst_addr = NextHead(inst_addr) + return invoke_num + +def calSconstants(bl): + start = bl[0] + end = bl[1] + invoke_num = 0 + inst_addr = start + while inst_addr < end: + opcode = GetMnem(inst_addr) + if opcode in calls: + invoke_num += 1 + inst_addr = NextHead(inst_addr) + return invoke_num + +def getConst(ea, offset): + strings = [] + consts = [] + optype1 = GetOpType(ea, offset) + if optype1 == idaapi.o_imm: + imm_value = GetOperandValue(ea, offset) + if idaapi.isLoaded(imm_value) and idaapi.getseg(imm_value): + str_value = GetString(imm_value) + strings.append(str_value) + else: + consts.append(imm_value) + return strings, consts + +def getBBconsts(bl): + strings = [] + consts = [] + start = bl[0] + end = bl[1] + invoke_num = 0 + inst_addr = start + while inst_addr < end: + strings_src, consts_src = getConst(inst_addr, 0) + strings_dst, consts_dst = getConst(inst_addr, 1) + strings += strings_src + strings += strings_dst + consts += consts_src + consts += consts_dst + inst_addr = NextHead(inst_addr) + return strings, consts + +def retrieveExterns(bl, ea_externs): + externs = [] + start = bl[0] + end = bl[1] + inst_addr = start + while inst_addr < end: + refs = CodeRefsFrom(inst_addr, 1) + try: + ea = [v for v in refs if v in ea_externs][0] + externs.append(ea_externs[ea]) + except: + pass + inst_addr = NextHead(inst_addr) + return externs + +def calTransferIns(bl): + x86_TI = {'jmp':1, 'jz':1, 'jnz':1, 'js':1, 'je':1, 'jne':1, 'jg':1, 'jle':1, 'jge':1, 'ja':1, 'jnc':1, 'call':1} + mips_TI = {'beq':1, 'bne':1, 'bgtz':1, "bltz":1, "bgez":1, "blez":1, 'j':1, 'jal':1, 'jr':1, 'jalr':1} + arm_TI = {'MVN':1, "MOV":1} + calls = {} + calls.update(x86_TI) + calls.update(mips_TI) + calls.update(arm_TI) + start = bl[0] + end = bl[1] + invoke_num = 0 + inst_addr = start + while inst_addr < end: + opcode = GetMnem(inst_addr) + re = [v for v in calls if opcode in v] + if len(re) > 0: + invoke_num += 1 + inst_addr = NextHead(inst_addr) + return invoke_num \ No newline at end of file diff --git a/raw-feature-extractor/preprocessing_ida.py b/raw-feature-extractor/preprocessing_ida.py new file mode 100644 index 0000000..6704223 --- /dev/null +++ b/raw-feature-extractor/preprocessing_ida.py @@ -0,0 +1,27 @@ +from func import * +from raw_graphs import * +from idc import * +import os +import argparse + +def parse_command(): + parser = argparse.ArgumentParser(description='Process some integers.') + parser.add_argument("--path", type=str, help="The directory where to store the generated .ida file") + args = parser.parse_args() + return args + +if __name__ == '__main__': + + args = parse_command() + path = args.path + analysis_flags = idc.GetShortPrm(idc.INF_START_AF) + analysis_flags &= ~idc.AF_IMMOFF + # turn off "automatically make offset" heuristic + idc.SetShortPrm(idc.INF_START_AF, analysis_flags) + idaapi.autoWait() + cfgs = get_func_cfgs_c(FirstSeg()) + binary_name = idc.GetInputFile() + '.ida' + fullpath = os.path.join(path, binary_name) + pickle.dump(cfgs, open(fullpath,'w')) + print binary_name + idc.Exit(0) \ No newline at end of file diff --git a/raw-feature-extractor/raw_graphs.py b/raw-feature-extractor/raw_graphs.py new file mode 100755 index 0000000..37bcc5d --- /dev/null +++ b/raw-feature-extractor/raw_graphs.py @@ -0,0 +1,286 @@ +import itertools +import sys +sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/') +import networkx as nx +#import numpy as np +from subprocess import Popen, PIPE +import pdb +import os +import re,mmap +#from graph_edit_new import * + +class raw_graph: + def __init__(self, funcname, g, func_f): + self.funcname = funcname + self.old_g = g[0] + self.g = nx.DiGraph() + self.entry = g[1] + self.fun_features = func_f + self.attributing() + + def __len__(self): + return len(self.g) + + def attributing(self): + self.obtainOffsprings(self.old_g) + for node in self.old_g: + fvector = self.retrieveVec(node, self.old_g) + self.g.add_node(node) + self.g.node[node]['v'] = fvector + + for edge in self.old_g.edges(): + node1 = edge[0] + node2 = edge[1] + self.g.add_edge(node1, node2) + + def obtainOffsprings(self,g): + nodes = g.nodes() + for node in nodes: + offsprings = {} + self.getOffsprings(g, node, offsprings) + g.node[node]['offs'] = len(offsprings) + return g + + def getOffsprings(self, g, node, offsprings): + node_offs = 0 + sucs = g.successors(node) + for suc in sucs: + if suc not in offsprings: + offsprings[suc] = 1 + self.getOffsprings(g, suc, offsprings) + + def retrieveVec(self, id_, g): + feature_vec = [] + #numC0 + numc = g.node[id_]['consts'] + feature_vec.append(numc) + #nums1 + nums = g.node[id_]['strings'] + feature_vec.append(nums) + #offsprings2 + offs = g.node[id_]['offs'] + feature_vec.append(offs) + #numAs3 + numAs = g.node[id_]['numAs'] + feature_vec.append(numAs) + # of calls4 + calls = g.node[id_]['numCalls'] + feature_vec.append(calls) + # of insts5 + insts = g.node[id_]['numIns'] + feature_vec.append(insts) + # of LIs6 + insts = g.node[id_]['numLIs'] + feature_vec.append(insts) + # of TIs7 + insts = g.node[id_]['numTIs'] + feature_vec.append(insts) + return feature_vec + + + def enumerating(self, n): + subgs = [] + #pdb.set_trace() + for sub_nodes in itertools.combinations(self.g.nodes(), n): + subg = self.g.subgraph(sub_nodes) + u_subg = subg.to_undirected() + if nx.is_connected(u_subg): + subgs.append(subg) + return subgs + + + def genMotifs(self, n): + motifs = {} + subgs = enumerating(n) + for subg in subgs: + if len(motifs) == 0: + motifs[subg] = [subg] + else: + nomatch = True + for mt in motifs: + if nx.is_isomorphic(mt, subg): + motifs[mt].append(subg) + nomatch = False + if nomatch: + motifs[subg] = [subg] + return motifs + + def enumerating_efficient(self, n): + #pdb.set_trace() + if len(self.g) >= 200: + return [] + with open('/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/OUTPUT.txt','wb') as f: + nx.write_edgelist(self.g,f,data=False) + #pdb.set_trace() + process = Popen(["/home/qian/workspace/FANMOD-command_line-source/executables/./fanmod_command_line_linux", str(n), "100000", "1", "/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/OUTPUT.txt", "1", "0", "0", "2", "0", "0", "0", "1000", "3", "3", "/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt", "0", "1"], stdout=PIPE, stderr=PIPE) + stdout, stderr = process.communicate() + if process.returncode >= 0: + #os.system("/home/qian/software/FANMOD-command_line-source/executables/./fanmod_command_line_linux " +str(n) + " 100000 1 /home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/OUTPUT.txt 1 0 0 2 0 0 0 1000 3 3 /home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt 0 1") + #pdb.set_trace() + #pdb.set_trace() + subgs = self.parseOutput("/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt.dump", n) + #pdb.set_trace() + os.remove("/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt.dump") + return subgs + return [] + + def parseOutput(self, path, n): + pattern = re.compile('[0-9]+\,[0-9]+\,[0-9]+\,[0-9]+') + subgraphs = [] + with open(path,'r') as f: + data = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ) + mo = re.findall(pattern, data) + if mo: + results = [map(int, v.split(',')[1:]) for v in mo] + subgraphs = self.createGraphDirectly(results) + return subgraphs + + def parseOutputByconditions(self, path, n): + pattern = re.compile('[0-9]+\,[0-9]+\,[0-9]+\,[0-9]+') + subgraphs = [] + with open(path,'r') as f: + data = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ) + mo = re.findall(pattern, data) + if mo: + results = [map(int, v.split(',')[1:]) for v in mo] + subgraphs = self.create_Graphbycondition_Directly(results) + return subgraphs + + def create_Graphbycondition_Directly(self, results): + subgs = [] + for indexes in results: + tg = template_graph() + subg = self.g.subgraph(indexes) + tg.updateG(subg) + subgs.append(tg) + del tg + return subgs + + def createGraphDirectly(self, results): + #pdb.set_trace() + #subgs = [self.g.subgraph(indexes) for indexes in results] + subgs = [] + for indexes in results: + tg = template_graph() + subg = self.g.subgraph(indexes) + tg.updateG(subg) + subgs.append(tg) + del tg + return subgs + + def createGraph(self, results, n): + binary_value = int(results[0],2) + indexes = [int(v) for v in results[1:]] + fang = self.createG(results[0], n) + if fang: + tg = template_graph(binary_value) + tg.updateG(fang, indexes, self.g) + return tg + pdb.set_trace() + print "there is g which is none" + + def createG(self, binary_str, n): + g = nx.DiGraph() + l = [int(v) for v in binary_str] + #pdb.set_trace() + shape = (n, n) + data = np.array(l) + ad_matrix = data.reshape(shape) + for i in xrange(n): + for j in xrange(n): + if ad_matrix[i][j] == 1: + g.add_edge(i, j) + return g + + + +class raw_graphs: + def __init__(self, binary_name): + self.binary_name = binary_name + self.raw_graph_list = [] + + def append(self, raw_g): + self.raw_graph_list.append(raw_g) + + def __len__(self): + return len(self.raw_graph_list) + + +class graphlets: + def __init__(self, funcname): + self.funcname = funcname + self.graphlets_list = [] + self.binary_name = None + + def updateBN(self, binary_name): + self.binary_name = binary_name + + def append(self, subg): + self.graphlets_list.append(subg) + + def appendSet(self, subgs): + self.graphlets_list += subgs + + def __len__(self): + return len(self.graphlets_list) + +class template_graph: + def __init__(self, value=None): + self.value = value + self.g = None + + def updateG(self,g): + self.g = g + #def updateIndexes(self, indexes): + # self.indexes = indexes + + #def updateAttributes(self, pg, indexes, maing): + # for id_ in xrange(len(indexes)): + # index = indexes[id_] + # gnode = self.findNode(index, maing) + # self.g.node[gnode] = pg.node[index] + + +class template_graphs: + def __init__(self, size): + self.size = size + self.gs = [] + self.bit_len = None + + def enumeratingAll(self): + subgs = [] + binary_value = self.genBinValue() + for i in xrange(binary_value): + if i == 0 : + continue + g = self.createG(i) + if g: + tg = template_graph(i) + tg.updateG(g) + self.gs.append(tg) + + def genBinValue(self): + n = self.size + self.bit_len = n*n + return 2**(self.bit_len) + + def createG(self, i): + g = nx.DiGraph() + l = self.genArray(i) + #pdb.set_trace() + shape = (self.size, self.size) + data = np.array(l) + ad_matrix = data.reshape(shape) + for i in xrange(self.size): + for j in xrange(self.size): + if ad_matrix[i][j] == 1: + g.add_edge(i, j) + u_g = g.to_undirected() + if len(g) == self.size and nx.is_connected(u_g): + return g + return False + + def genArray(self, i): + l = [int(x) for x in bin(i)[2:]] + x = [0 for v in xrange(self.bit_len - len(l))] + return x + l