diff --git a/raw-feature-extractor/discovRe.py b/raw-feature-extractor/discovRe.py new file mode 100755 index 0000000..997edfb --- /dev/null +++ b/raw-feature-extractor/discovRe.py @@ -0,0 +1,228 @@ +# +# Reference Lister +# +# List all functions and all references to them in the current section. +# +# Implemented with the idautils module +# +import networkx as nx +import cPickle as pickle +import pdb +from graph_analysis_ida import * +from graph_property import * +#import wingdbstub +#wingdbstub.Ensure() + +def get_funcs(ea): + funcs = {} + # Get current ea + # Loop from start to end in the current segment + for funcea in Functions(SegStart(ea)): + funcname = GetFunctionName(funcea) + func = get_func(funcea) + blocks = FlowChart(func) + funcs[funcname] = [] + for bl in blocks: + start = bl.startEA + end = bl.endEA + funcs[funcname].append((start, end)) + return funcs + +def get_funcs_for_discoverRe(ea): + features = {} + for funcea in Functions(SegStart(ea)): + funcname = GetFunctionName(funcea) + print funcname + func = get_func(funcea) + feature = get_discoverRe_feature(func) + features[funcname] = feature + return features + +def get_discoverRe_feature(func, icfg): + start = func.startEA + end = func.endEA + features = [] + FunctionCalls = getFuncCalls(func) + #1 + features.append(FunctionCalls) + LogicInstr = getLogicInsts(func) + #2 + features.append(LogicInstr) + Transfer = getTransferInsts(func) + #3 + features.append(Transfer) + Locals = getLocalVariables(func) + #4 + features.append(Locals) + BB = getBasicBlocks(func) + #5 + features.append(BB) + Edges = len(icfg.edges()) + #6 + features.append(Edges) + Incoming = getIncommingCalls(func) + #7 + features.append(Incoming) + #8 + Instrs = getIntrs(func) + features.append(Instrs) + between = retrieveGP(icfg) + #9 + features.append(between) + + strings, consts = getfunc_consts(func) + features.append(strings) + features.append(consts) + return features + +def get_func_names(ea): + funcs = {} + for funcea in Functions(SegStart(ea)): + funcname = GetFunctionName(funcea) + funcs[funcname] = funcea + return funcs + +def get_func_bases(ea): + funcs = {} + for funcea in Functions(SegStart(ea)): + funcname = GetFunctionName(funcea) + funcs[funcea] = funcname + return funcs + +def get_func_range(ea): + funcs = {} + for funcea in Functions(SegStart(ea)): + funcname = GetFunctionName(funcea) + func = get_func(funcea) + funcs[funcname] = (func.startEA, func.endEA) + return funcs + +def get_func_sequences(ea): + funcs_bodylist = {} + funcs = get_funcs(ea) + for funcname in funcs: + if funcname not in funcs_bodylist: + funcs_bodylist[funcname] = [] + for start, end in funcs[funcname]: + inst_addr = start + while inst_addr <= end: + opcode = GetMnem(inst_addr) + funcs_bodylist[funcname].append(opcode) + inst_addr = NextHead(inst_addr) + return funcs_bodylist + +def get_func_cfgs(ea): + func_cfglist = {} + i = 0 + start, end = get_section('LOAD') + #print start, end + for funcea in Functions(SegStart(ea)): + if start <= funcea <= end: + funcname = GetFunctionName(funcea) + func = get_func(funcea) + print i + i += 1 + try: + icfg = cfg.cfg_construct(func) + func_cfglist[funcname] = icfg + except: + pass + + return func_cfglist + +def get_section(t): + base = SegByName(t) + start = SegByBase(base) + end = SegEnd(start) + return start, end + + +def get_func_cfg_sequences(func_cfglist): + func_cfg_seqlist = {} + for funcname in func_cfglist: + func_cfg_seqlist[funcname] = {} + cfg = func_cfglist[funcname][0] + for start, end in cfg: + codesq = get_sequences(start, end) + func_cfg_seqlist[funcname][(start,end)] = codesq + + return func_cfg_seqlist + + +def get_sequences(start, end): + seq = [] + inst_addr = start + while inst_addr <= end: + opcode = GetMnem(inst_addr) + seq.append(opcode) + inst_addr = NextHead(inst_addr) + return seq + +def get_stack_arg(func_addr): + print func_addr + args = [] + stack = GetFrame(func_addr) + if not stack: + return [] + firstM = GetFirstMember(stack) + lastM = GetLastMember(stack) + i = firstM + while i <=lastM: + mName = GetMemberName(stack,i) + mSize = GetMemberSize(stack,i) + if mSize: + i = i + mSize + else: + i = i+4 + if mName not in args and mName and ' s' not in mName and ' r' not in mName: + args.append(mName) + return args + + #pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w')) + +def processDataSegs(): + funcdata = {} + datafunc = {} + for n in xrange(idaapi.get_segm_qty()): + seg = idaapi.getnseg(n) + ea = seg.startEA + segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE) + if segtype in [idc.SEG_DATA, idc.SEG_BSS]: + start = idc.SegStart(ea) + end = idc.SegEnd(ea) + cur = start + while cur <= end: + refs = [v for v in DataRefsTo(cur)] + for fea in refs: + name = GetFunctionName(fea) + if len(name)== 0: + continue + if name not in funcdata: + funcdata[name] = [cur] + else: + funcdata[name].append(cur) + if cur not in datafunc: + datafunc[cur] = [name] + else: + datafunc[cur].append(name) + cur = NextHead(cur) + return funcdata, datafunc + +def obtainDataRefs(callgraph): + datarefs = {} + funcdata, datafunc = processDataSegs() + for node in callgraph: + if node in funcdata: + datas = funcdata[node] + for dd in datas: + refs = datafunc[dd] + refs = list(set(refs)) + if node in datarefs: + print refs + datarefs[node] += refs + datarefs[node] = list(set(datarefs[node])) + else: + datarefs[node] = refs + return datarefs + + diff --git a/raw-feature-extractor/func.py b/raw-feature-extractor/func.py index b68026b..cc0c402 100755 --- a/raw-feature-extractor/func.py +++ b/raw-feature-extractor/func.py @@ -13,7 +13,8 @@ import cfg_constructor as cfg import cPickle as pickle import pdb from raw_graphs import * -from discovRe_feature.discovRe import * +#from discovRe_feature.discovRe import * +from discovRe import * #import wingdbstub #wingdbstub.Ensure() def gt_funcNames(ea): diff --git a/raw-feature-extractor/graph_analysis_ida.py b/raw-feature-extractor/graph_analysis_ida.py index 3d82bf0..122efa9 100644 --- a/raw-feature-extractor/graph_analysis_ida.py +++ b/raw-feature-extractor/graph_analysis_ida.py @@ -2,23 +2,146 @@ from idautils import * from idaapi import * from idc import * -def getSequences(start, end): - seqs = [] +def getfunc_consts(func): + strings = [] + consts = [] + blocks = [(v.startEA, v.endEA) for v in FlowChart(func)] + for bl in blocks: + strs, conts = getBBconsts(bl) + strings += strs + consts += conts + return strings, consts + +def getConst(ea, offset): + strings = [] + consts = [] + optype1 = GetOpType(ea, offset) + if optype1 == idaapi.o_imm: + imm_value = GetOperandValue(ea, offset) + if 0<= imm_value <= 10: + consts.append(imm_value) + else: + if idaapi.isLoaded(imm_value) and idaapi.getseg(imm_value): + str_value = GetString(imm_value) + if str_value is None: + str_value = GetString(imm_value+0x40000) + if str_value is None: + consts.append(imm_value) + else: + re = all(40 <= ord(c) < 128 for c in str_value) + if re: + strings.append(str_value) + else: + consts.append(imm_value) + else: + re = all(40 <= ord(c) < 128 for c in str_value) + if re: + strings.append(str_value) + else: + consts.append(imm_value) + else: + consts.append(imm_value) + return strings, consts + +def getBBconsts(bl): + strings = [] + consts = [] + start = bl[0] + end = bl[1] + invoke_num = 0 inst_addr = start while inst_addr < end: opcode = GetMnem(inst_addr) - if opcode == 'move' or opcode == "mov": - opnd1 = GetOpnd(inst_addr,0) - if opnd1 == '$v0' or opnd1 == "$eax": - opcode = (opcode, opnd1) - seqs.append(opcode) + if opcode in ['la','jalr','call', 'jal']: + inst_addr = NextHead(inst_addr) + continue + strings_src, consts_src = getConst(inst_addr, 0) + strings_dst, consts_dst = getConst(inst_addr, 1) + strings += strings_src + strings += strings_dst + consts += consts_src + consts += consts_dst + try: + strings_dst, consts_dst = getConst(inst_addr, 2) + consts += consts_dst + strings += strings_dst + except: + pass + inst_addr = NextHead(inst_addr) - return seqs + return strings, consts + +def getFuncCalls(func): + blocks = [(v.startEA, v.endEA) for v in FlowChart(func)] + sumcalls = 0 + for bl in blocks: + callnum = calCalls(bl) + sumcalls += callnum + return sumcalls + +def getLogicInsts(func): + blocks = [(v.startEA, v.endEA) for v in FlowChart(func)] + sumcalls = 0 + for bl in blocks: + callnum = calLogicInstructions(bl) + sumcalls += callnum + return sumcalls + +def getTransferInsts(func): + blocks = [(v.startEA, v.endEA) for v in FlowChart(func)] + sumcalls = 0 + for bl in blocks: + callnum = calTransferIns(bl) + sumcalls += callnum + return sumcalls + +def getIntrs(func): + blocks = [(v.startEA, v.endEA) for v in FlowChart(func)] + sumcalls = 0 + for bl in blocks: + callnum = calInsts(bl) + sumcalls += callnum + return sumcalls + +def getLocalVariables(func): + args_num = get_stackVariables(func.startEA) + return args_num + +def getBasicBlocks(func): + blocks = [(v.startEA, v.endEA) for v in FlowChart(func)] + return len(blocks) + +def getIncommingCalls(func): + refs = CodeRefsTo(func.startEA, 0) + re = len([v for v in refs]) + return re + + +def get_stackVariables(func_addr): + #print func_addr + args = [] + stack = GetFrame(func_addr) + if not stack: + return 0 + firstM = GetFirstMember(stack) + lastM = GetLastMember(stack) + i = firstM + while i <=lastM: + mName = GetMemberName(stack,i) + mSize = GetMemberSize(stack,i) + if mSize: + i = i + mSize + else: + i = i+4 + if mName not in args and mName and 'var_' in mName: + args.append(mName) + return len(args) + + def calArithmeticIns(bl): x86_AI = {'add':1, 'sub':1, 'div':1, 'imul':1, 'idiv':1, 'mul':1, 'shl':1, 'dec':1, 'inc':1} mips_AI = {'add':1, 'addu':1, 'addi':1, 'addiu':1, 'mult':1, 'multu':1, 'div':1, 'divu':1} - arm_AI = {"ADD":1, "ADC":1, "SUB":1, "SBC":1, "RSB":1, "RSC":1, "MUL":1, "MLA":1} calls = {} calls.update(x86_AI) calls.update(mips_AI) @@ -28,22 +151,20 @@ def calArithmeticIns(bl): inst_addr = start while inst_addr < end: opcode = GetMnem(inst_addr) - re = [v for v in calls if opcode in v] - if len(re) > 0: + if opcode in calls: invoke_num += 1 inst_addr = NextHead(inst_addr) return invoke_num def calCalls(bl): - calls = {'call':1, 'jal':1, 'jalr':1, "BL":1} + calls = {'call':1, 'jal':1, 'jalr':1} start = bl[0] end = bl[1] invoke_num = 0 inst_addr = start while inst_addr < end: opcode = GetMnem(inst_addr) - re = [v for v in calls if opcode in v] - if len(re) > 0: + if opcode in calls: invoke_num += 1 inst_addr = NextHead(inst_addr) return invoke_num @@ -61,19 +182,16 @@ def calInsts(bl): def calLogicInstructions(bl): x86_LI = {'and':1, 'andn':1, 'andnpd':1, 'andpd':1, 'andps':1, 'andnps':1, 'test':1, 'xor':1, 'xorpd':1, 'pslld':1} mips_LI = {'and':1, 'andi':1, 'or':1, 'ori':1, 'xor':1, 'nor':1, 'slt':1, 'slti':1, 'sltu':1} - arm_LI = {"AND":1, "EOR":1, "ORR":1, "ORN":1, 'BIC':1} calls = {} calls.update(x86_LI) calls.update(mips_LI) - calls.update(arm_LI) start = bl[0] end = bl[1] invoke_num = 0 inst_addr = start while inst_addr < end: opcode = GetMnem(inst_addr) - re = [v for v in calls if opcode in v] - if len(re) > 0: + if opcode in calls: invoke_num += 1 inst_addr = NextHead(inst_addr) return invoke_num @@ -90,35 +208,19 @@ def calSconstants(bl): inst_addr = NextHead(inst_addr) return invoke_num -def getConst(ea, offset): - strings = [] - consts = [] - optype1 = GetOpType(ea, offset) - if optype1 == idaapi.o_imm: - imm_value = GetOperandValue(ea, offset) - if idaapi.isLoaded(imm_value) and idaapi.getseg(imm_value): - str_value = GetString(imm_value) - strings.append(str_value) - else: - consts.append(imm_value) - return strings, consts -def getBBconsts(bl): - strings = [] - consts = [] +def calNconstants(bl): start = bl[0] end = bl[1] invoke_num = 0 inst_addr = start while inst_addr < end: - strings_src, consts_src = getConst(inst_addr, 0) - strings_dst, consts_dst = getConst(inst_addr, 1) - strings += strings_src - strings += strings_dst - consts += consts_src - consts += consts_dst + optype1 = GetOpType(inst_addr, 0) + optype2 = GetOpType(inst_addr, 1) + if optype1 == 5 or optype2 == 5: + invoke_num += 1 inst_addr = NextHead(inst_addr) - return strings, consts + return invoke_num def retrieveExterns(bl, ea_externs): externs = [] @@ -142,7 +244,6 @@ def calTransferIns(bl): calls = {} calls.update(x86_TI) calls.update(mips_TI) - calls.update(arm_TI) start = bl[0] end = bl[1] invoke_num = 0