Batch operations

parent 0f1e3378a2
commit 548eedb292
@@ -4,7 +4,7 @@
   <content url="file://$MODULE_DIR$">
     <sourceFolder url="file://$MODULE_DIR$/Genius3/python" isTestSource="false" />
   </content>
-  <orderEntry type="jdk" jdkName="Python 2.7" jdkType="Python SDK" />
+  <orderEntry type="jdk" jdkName="malgraph" jdkType="Python SDK" />
   <orderEntry type="sourceFolder" forTests="false" />
 </component>
 <component name="PyDocumentationSettings">
@@ -1,4 +1,4 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="malgraph" project-jdk-type="Python SDK" />
 </project>
@@ -2,7 +2,7 @@
 setlocal EnableDelayedExpansion
 
 
-set "FOLDER_PATH=D:\bishe\dataset\train_benign"
+set "FOLDER_PATH=D:\bishe\dataset\train_benign_part0"
 
 
 
Genius3/bat_file/malware/ida_file_cerate_malware.bat (new file, 16 lines)
@@ -0,0 +1,16 @@
+@echo off
+setlocal EnableDelayedExpansion
+
+
+set "FOLDER_PATH=D:\bishe\dataset\sample_20230130_458"
+
+
+
+for %%f in ("%FOLDER_PATH%\*") do (
+    echo !time! %%f
+    D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out %%f
+
+)
+
+endlocal
+
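Note: the new batch file drives IDA in headless mode over every sample in a folder. A rough Python equivalent of that loop, for readers who prefer scripting it (paths and IDA flags are taken from the batch file above; the subprocess wrapper itself is a sketch, not part of this commit):

import os
import subprocess

# Same paths as the batch file above; -c deletes the old database, -A runs
# autonomous analysis, -S passes the preprocessing script plus its argument.
FOLDER_PATH = r"D:\bishe\dataset\sample_20230130_458"
IDA = r"D:\IDA_Pro_v6.8\idaq64.exe"
SCRIPT = r"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0"

for name in os.listdir(FOLDER_PATH):
    sample = os.path.join(FOLDER_PATH, name)
    # -o points IDA's output database at the out directory, as in the batch file
    subprocess.call([IDA, "-c", "-A", "-S" + SCRIPT, r"-oD:\bishe\dataset\out", sample])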
@@ -1,16 +0,0 @@
-# -*- coding: UTF-8 -*-
-import sys
-
-from func import *
-from raw_graphs import *
-from idc import *
-import os
-import argparse
-if __name__ == '__main__':
-    print "hello"
-
-#
-# E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -A -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
-# -c delete the old database  -A automatic analysis, no dialogs shown
-# -B is equivalent to -c -A
-
@@ -1,81 +0,0 @@
-class HierarchicalGraphNeuralNetwork(nn.Module):
-    def __init__(self, external_vocab: Vocab):
-        super(HierarchicalGraphNeuralNetwork, self).__init__()
-        self.pool = 'global_max_pool'
-        # Hierarchical 1: Control Flow Graph (CFG) embedding and pooling
-        cfg_filter_list = [200, 200]
-        cfg_filter_list.insert(0, 11)
-        self.cfg_filter_length = len(cfg_filter_list)
-        cfg_graphsage_params = [dict(in_channels=cfg_filter_list[i], out_channels=cfg_filter_list[i + 1], bias=True) for
-                                i in range(self.cfg_filter_length - 1)]
-        cfg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=cfg_graphsage_params)
-        cfg_constructor = cfg_conv['constructor']
-        for i in range(self.cfg_filter_length - 1):
-            setattr(self, 'CFG_gnn_{}'.format(i + 1), cfg_constructor(**cfg_conv['kwargs'][i]))
-        self.dropout = nn.Dropout(p=0.2)
-        # Hierarchical 2: Function Call Graph (FCG) embedding and pooling
-        self.external_embedding_layer = nn.Embedding(num_embeddings=external_vocab.max_vocab_size + 2,
-                                                     embedding_dim=cfg_filter_list[-1],
-                                                     padding_idx=external_vocab.pad_idx)
-        fcg_filter_list = [200, 200]
-        fcg_filter_list.insert(0, cfg_filter_list[-1])
-        self.fcg_filter_length = len(fcg_filter_list)
-        fcg_graphsage_params = [dict(in_channels=fcg_filter_list[i], out_channels=fcg_filter_list[i + 1], bias=True) for
-                                i in range(self.fcg_filter_length - 1)]
-        fcg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=fcg_graphsage_params)
-        fcg_constructor = fcg_conv['constructor']
-        for i in range(self.fcg_filter_length - 1):
-            setattr(self, 'FCG_gnn_{}'.format(i + 1), fcg_constructor(**fcg_conv['kwargs'][i]))
-        # Last Projection Function: gradually project with more linear layers
-        self.pj1 = torch.nn.Linear(in_features=fcg_filter_list[-1], out_features=int(fcg_filter_list[-1] / 2))
-        self.pj2 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 2), out_features=int(fcg_filter_list[-1] / 4))
-        self.pj3 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 4), out_features=6)
-        self.last_activation = nn.Softmax(dim=1)
-
-    def forward(self, real_local_batch: Batch, real_bt_positions: list, bt_external_names: list,
-                bt_all_function_edges: list):
-        rtn_local_batch = self.forward_cfg_gnn(local_batch=real_local_batch)
-        x_cfg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_local_batch.x, batch=rtn_local_batch.batch)
-        fcg_list = []
-        fcg_internal_list = []
-        for idx_batch in range(len(real_bt_positions) - 1):
-            start_pos, end_pos = real_bt_positions[idx_batch: idx_batch + 2]
-            idx_x_cfg = x_cfg_pool[start_pos: end_pos]
-            fcg_internal_list.append(idx_x_cfg)
-            idx_x_external = self.external_embedding_layer(
-                torch.tensor([bt_external_names[idx_batch]], dtype=torch.long))
-            idx_x_external = idx_x_external.squeeze(dim=0)
-            idx_x_total = torch.cat([idx_x_cfg, idx_x_external], dim=0)
-            idx_function_edge = torch.tensor(bt_all_function_edges[idx_batch], dtype=torch.long)
-            idx_graph_data = Data(x=idx_x_total, edge_index=idx_function_edge)
-            idx_graph_data.validate()
-            fcg_list.append(idx_graph_data)
-        fcg_batch = Batch.from_data_list(fcg_list)
-        # Hierarchical 2: Function Call Graph (FCG) embedding and pooling
-        rtn_fcg_batch = self.forward_fcg_gnn(function_batch=fcg_batch)  # [batch_size, max_node_size, dim]
-        x_fcg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_fcg_batch.x, batch=rtn_fcg_batch.batch)
-        batch_final = x_fcg_pool
-        # step last project to the number_of_classes (multiclass)
-        bt_final_embed = self.pj3(self.pj2(self.pj1(batch_final)))
-        bt_pred = self.last_activation(bt_final_embed)
-        return bt_pred
-
-    def forward_cfg_gnn(self, local_batch: Batch):
-        in_x, edge_index = local_batch.x, local_batch.edge_index
-        for i in range(self.cfg_filter_length - 1):
-            out_x = getattr(self, 'CFG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
-            out_x = torch.nn.functional.relu(out_x, inplace=True)
-            out_x = self.dropout(out_x)
-            in_x = out_x
-        local_batch.x = in_x
-        return local_batch
-
-    def forward_fcg_gnn(self, function_batch: Batch):
-        in_x, edge_index = function_batch.x, function_batch.edge_index
-        for i in range(self.fcg_filter_length - 1):
-            out_x = getattr(self, 'FCG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
-            out_x = torch.nn.functional.relu(out_x, inplace=True)
-            out_x = self.dropout(out_x)
-            in_x = out_x
-        function_batch.x = in_x
-        return function_batch
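For context on what was removed: the model embeds each function's CFG with one GraphSAGE stack, max-pools to a single vector per function, then runs a second GraphSAGE stack over the function call graph and projects to 6 classes. A minimal sketch of how such a model would be driven (the batch layout, vocab ids, and edge lists below are assumptions inferred from the signatures above, not values from this repository):

import torch
from torch_geometric.data import Batch, Data

# Two tiny CFGs with 11-dim basic-block features, as the first GNN level expects.
cfgs = [Data(x=torch.randn(3, 11), edge_index=torch.tensor([[0, 1], [1, 2]])),
        Data(x=torch.randn(2, 11), edge_index=torch.tensor([[0], [1]]))]
local_batch = Batch.from_data_list(cfgs)
positions = [0, 2]                         # one binary owning CFG graphs [0, 2)
external_names = [[0, 1]]                  # external-callee vocab ids (assumed layout)
function_edges = [[[0, 1, 2], [1, 2, 3]]]  # FCG edges over internal + external nodes

# model = HierarchicalGraphNeuralNetwork(external_vocab)  # Vocab is project-specific
# pred = model(local_batch, positions, external_names, function_edges)  # shape [1, 6]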
@@ -242,7 +242,5 @@ def convert_benign(overhaul):
 
 
 if __name__ == '__main__':
-    # convert(35, 69)
-    # convert_benign(True)
     convert_benign(True)
     convert_malware(True)
@@ -1,264 +0,0 @@
-# coding=utf-8
-#
-# Reference Lister
-#
-# List all functions and all references to them in the current section.
-#
-# Implemented with the idautils module
-#
-import networkx as nx
-import pdb
-from graph_analysis_ida import *
-from graph_property import *
-
-
-# import wingdbstub
-# wingdbstub.Ensure()
-
-def get_funcs(ea):
-    funcs = {}
-    # Get current ea
-    # Loop from start to end in the current segment
-    for funcea in Functions(SegStart(ea)):
-        funcname = GetFunctionName(funcea)
-        func = get_func(funcea)
-        blocks = FlowChart(func)
-        funcs[funcname] = []
-        for bl in blocks:
-            start = bl.startEA
-            end = bl.endEA
-            funcs[funcname].append((start, end))
-    return funcs
-
-
-# Apparently an unused function
-# def get_funcs_for_discoverRe(ea):
-#     features = {}
-#     for funcea in Functions(SegStart(ea)):
-#         funcname = GetFunctionName(funcea)
-#         print(funcname)
-#         func = get_func(funcea)
-#         feature = get_discoverRe_feature(func)
-#         features[funcname] = feature
-#     return features
-
-
-# Get the 11-dimensional attribute features of every basic block:
-# calls / transfers / arithmetic / logic / compare / move / interrupt / data declarations / total instructions / string-or-integer constants / number of offspring
-def get_bb_features(func):
-    bb_features = []
-    blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
-    for bl in blocks:
-        calls = calCalls(bl)
-        transferIns = calTransferIns(bl)
-        mathematicsIns = calArithmeticIns(bl)
-        logicIns = calLogicInstructions(bl)
-        cmpIns = calIns(bl, {'cmp': 1, 'cmps': 1, 'cmpsb': 1, 'cmppd': 1, 'cmpps': 1, 'fcom': 1, 'fcomp': 1, 'fcompp': 1, 'ficom': 1, 'ficomp': 1, 'ptest': 1, 'test': 1})
-        movIns = calIns(bl, {'mov': 1, 'movb': 1, 'movw': 1, 'movl': 1, 'movq': 1, 'movabsq': 1, 'push': 1, 'pop': 1, 'lea': 1})
-        interruptIns = calIns(bl, {'int1': 1, 'int3': 1, 'into': 1, 'iret': 1, 'iretd': 1, 'iretq': 1})
-        declareIns = calIns(bl, {'dw': 1, 'dd': 1, 'db': 1})
-        totalIns = calInsts(bl)
-        consts = getBBconsts(bl)
-        stringOrIntConsts = len(consts[0]) + len(consts[1])
-        bb_features.append([calls, transferIns, mathematicsIns, logicIns, cmpIns, movIns,
-                            interruptIns, declareIns, totalIns, stringOrIntConsts])
-    return bb_features
-
-
-def get_discoverRe_feature(func, icfg):
-    start = func.startEA
-    end = func.endEA
-    features = []
-    FunctionCalls = getFuncCalls(func)
-    # 1
-    features.append(FunctionCalls)
-    LogicInstr = getLogicInsts(func)
-    # 2
-    features.append(LogicInstr)
-    Transfer = getTransferInsts(func)
-    # 3
-    features.append(Transfer)
-    Locals = getLocalVariables(func)
-    # 4
-    features.append(Locals)
-    BB = getBasicBlocks(func)
-    # 5
-    features.append(BB)
-    Edges = len(icfg.edges())
-    # 6
-    features.append(Edges)
-    Incoming = getIncommingCalls(func)
-    # 7
-    features.append(Incoming)
-    # 8
-    Instrs = getIntrs(func)
-    features.append(Instrs)
-    between = retrieveGP(icfg)
-    # 9
-    features.append(between)
-
-    strings, consts = getfunc_consts(func)
-    # 10
-    features.append(strings)
-    # 11
-    features.append(consts)
-    return features
-
-
-def get_func_names(ea):
-    funcs = {}
-    for funcea in Functions(SegStart(ea)):
-        funcname = GetFunctionName(funcea)
-        funcs[funcname] = funcea
-    return funcs
-
-
-def get_func_bases(ea):
-    funcs = {}
-    for funcea in Functions(SegStart(ea)):
-        funcname = GetFunctionName(funcea)
-        funcs[funcea] = funcname
-    return funcs
-
-
-def get_func_range(ea):
-    funcs = {}
-    for funcea in Functions(SegStart(ea)):
-        funcname = GetFunctionName(funcea)
-        func = get_func(funcea)
-        funcs[funcname] = (func.startEA, func.endEA)
-    return funcs
-
-
-def get_func_sequences(ea):
-    funcs_bodylist = {}
-    funcs = get_funcs(ea)
-    for funcname in funcs:
-        if funcname not in funcs_bodylist:
-            funcs_bodylist[funcname] = []
-        for start, end in funcs[funcname]:
-            inst_addr = start
-            while inst_addr <= end:
-                opcode = GetMnem(inst_addr)
-                funcs_bodylist[funcname].append(opcode)
-                inst_addr = NextHead(inst_addr)
-    return funcs_bodylist
-
-
-def get_func_cfgs(ea):
-    func_cfglist = {}
-    i = 0
-    start, end = get_section('LOAD')
-    # print start, end
-    for funcea in Functions(SegStart(ea)):
-        if start <= funcea <= end:
-            funcname = GetFunctionName(funcea)
-            func = get_func(funcea)
-            print(i)
-            i += 1
-            try:
-                icfg = cfg.cfg_construct(func)
-                func_cfglist[funcname] = icfg
-            except:
-                pass
-
-    return func_cfglist
-
-
-def get_section(t):
-    base = SegByName(t)
-    start = SegByBase(base)
-    end = SegEnd(start)
-    return start, end
-
-
-def get_func_cfg_sequences(func_cfglist):
-    func_cfg_seqlist = {}
-    for funcname in func_cfglist:
-        func_cfg_seqlist[funcname] = {}
-        cfg = func_cfglist[funcname][0]
-        for start, end in cfg:
-            codesq = get_sequences(start, end)
-            func_cfg_seqlist[funcname][(start, end)] = codesq
-
-    return func_cfg_seqlist
-
-
-def get_sequences(start, end):
-    seq = []
-    inst_addr = start
-    while inst_addr <= end:
-        opcode = GetMnem(inst_addr)
-        seq.append(opcode)
-        inst_addr = NextHead(inst_addr)
-    return seq
-
-
-def get_stack_arg(func_addr):
-    print(func_addr)
-    args = []
-    stack = GetFrame(func_addr)
-    if not stack:
-        return []
-    firstM = GetFirstMember(stack)
-    lastM = GetLastMember(stack)
-    i = firstM
-    while i <= lastM:
-        mName = GetMemberName(stack, i)
-        mSize = GetMemberSize(stack, i)
-        if mSize:
-            i = i + mSize
-        else:
-            i = i + 4
-        if mName not in args and mName and ' s' not in mName and ' r' not in mName:
-            args.append(mName)
-    return args
-
-
-# pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))
-
-
-def processDataSegs():
-    funcdata = {}
-    datafunc = {}
-    for n in xrange(idaapi.get_segm_qty()):
-        seg = idaapi.getnseg(n)
-        ea = seg.startEA
-        segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE)
-        if segtype in [idc.SEG_DATA, idc.SEG_BSS]:
-            start = idc.SegStart(ea)
-            end = idc.SegEnd(ea)
-            cur = start
-            while cur <= end:
-                refs = [v for v in DataRefsTo(cur)]
-                for fea in refs:
-                    name = GetFunctionName(fea)
-                    if len(name) == 0:
-                        continue
-                    if name not in funcdata:
-                        funcdata[name] = [cur]
-                    else:
-                        funcdata[name].append(cur)
-                    if cur not in datafunc:
-                        datafunc[cur] = [name]
-                    else:
-                        datafunc[cur].append(name)
-                cur = NextHead(cur)
-    return funcdata, datafunc
-
-
-def obtainDataRefs(callgraph):
-    datarefs = {}
-    funcdata, datafunc = processDataSegs()
-    for node in callgraph:
-        if node in funcdata:
-            datas = funcdata[node]
-            for dd in datas:
-                refs = datafunc[dd]
-                refs = list(set(refs))
-                if node in datarefs:
-                    print(refs)
-                    datarefs[node] += refs
-                    datarefs[node] = list(set(datarefs[node]))
-                else:
-                    datarefs[node] = refs
-    return datarefs
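As a reading aid for the deleted get_discoverRe_feature above, the 11 slots of its returned vector can be labeled like this (the names are mine; only the order comes from the numbered comments in the code):

# Index-to-name map for the 11-entry function feature vector (order from the code above).
DISCOVRE_FEATURES = [
    "function_calls",         # 1
    "logic_instructions",     # 2
    "transfer_instructions",  # 3
    "local_variables",        # 4
    "basic_blocks",           # 5
    "icfg_edges",             # 6
    "incoming_calls",         # 7
    "instructions",           # 8
    "betweenness",            # 9
    "strings",                # 10
    "numeric_constants",      # 11
]

def label_features(vec):
    """Pair a raw 11-entry feature vector with readable names."""
    return dict(zip(DISCOVRE_FEATURES, vec))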
@@ -16,9 +16,7 @@ from raw_graphs import *
 #from discovRe_feature.discovRe import *
 from discovRe import *
 
-sys.path.append("D:\\hkn\\project_folder\\Gencoding3\\Genius3\\python")
-#import wingdbstub
-#wingdbstub.Ensure()
+
 
 
 
@@ -119,24 +119,23 @@ def getIncommingCalls(func):
 
 
 def get_stackVariables(func_addr):
-    #print func_addr
     args = []
     stack = GetFrame(func_addr)
     if not stack:
         return 0
     firstM = GetFirstMember(stack)
     lastM = GetLastMember(stack)
     i = firstM
-    while i <=lastM:
-        mName = GetMemberName(stack,i)
-        mSize = GetMemberSize(stack,i)
+    while i <= lastM:
+        mName = GetMemberName(stack, i)
+        mSize = GetMemberSize(stack, i)
         if mSize:
             i = i + mSize
         else:
-            i = i+4
+            i = i + 4
         if mName not in args and mName and 'var_' in mName:
             args.append(mName)
     return len(args)
 
 
 # Count the arithmetic instructions
@@ -1,7 +1,7 @@
-# coding=utf-8
 import os
 import pickle
-import idc
+from func import *
+from idc import *
 import idaapi
 
 # Define constants
@@ -12,6 +12,7 @@ CFG_EXTENSION = ".ida"
 GDL_EXTENSION = ".dot"
 ASM_EXTENSION = ".asm"
 
+
 def preprocess(binary_name, workflow):
     cfg_path = os.path.join(
         INFECTED_DIR if workflow != "-1" else BENIGN_DIR,
@@ -29,9 +30,9 @@ def preprocess(binary_name, workflow):
     if os.path.exists(cfg_path):
         idc.Exit(0)
     else:
-        analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
-        analysis_flags &= ~idc.AF_IMMOFF
-        idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
+        analysis_flags = idc.GetShortPrm(idc.INF_AF2)
+        analysis_flags &= ~ida_ida.AF_IMMOFF
+        idc.SetShortPrm(idc.INF_AF2, analysis_flags)
 
         idaapi.autoWait()
 
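The change above swaps which analysis-flag word is read (INF_START_AF to INF_AF2) and where AF_IMMOFF comes from (idc to ida_ida). A hedged sketch of the flag-clearing pattern in isolation; which module exposes each constant depends on the IDAPython version, and the names here simply follow the new side of the diff:

import idc
import idaapi
import ida_ida  # assumed available, per the new side of the diff

# Clear "convert immediates to offsets" before auto-analysis, then wait for it.
flags = idc.GetShortPrm(idc.INF_AF2)
flags &= ~ida_ida.AF_IMMOFF
idc.SetShortPrm(idc.INF_AF2, flags)
idaapi.autoWait()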
@@ -47,17 +48,21 @@ def preprocess(binary_name, workflow):
     # Close IDA Pro
     idc.Exit(0)
 
+
 def generate_cfg(binary_name, cfg_path):
     cfgs = get_func_cfgs_c(FirstSeg())
     with open(cfg_path, 'wb') as cfg_file:
         pickle.dump(cfgs, cfg_file)
 
+
 def generate_gdl(gdl_path):
     idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT)
 
+
 def generate_asm(asm_path):
     idc.GenerateFile(idc.OFILE_ASM, asm_path, 0, idc.BADADDR, 0)
 
+
 # Main function
 def main():
     binary_name = idc.GetInputFile()
@@ -68,6 +73,7 @@ def main():
         return
     preprocess(binary_name, workflow)
 
+
 # When run as an IDA Pro script, call the main function
 if __name__ == "__main__":
     main()
@@ -1,101 +0,0 @@
-# -*- coding: UTF-8 -*-
-import sys
-from matplotlib import pyplot as plt
-import networkx as nx
-import pickle
-# sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
-# sys.path.insert(1, 'C:/Python27/Lib/site-packages')
-
-
-def print_obj(obj):
-    # "Print all attributes of an object"
-    print(obj.__dict__)
-
-
-# sub_10F20 308: the decompiled code contains strings, but there are no string constants in this feature extraction; they are probably referenced indirectly and not recognized. Looking at the features of all functions, almost none have string constants; they may be stored elsewhere and referenced.
-# sub_166C4 393
-if __name__ == '__main__':
-    testpath = "D:\\hkn\\infected\\datasets\\virusshare_infected23_cfg\\VirusShare_9ba64176b2ca61212ff56a5b4eb546ff.ida"
-    fr = open(testpath, 'r')
-    data = pickle.load(fr)  # the ACFGs of one binary
-    fr.close()
-
-    # print(type(data1))
-    # print_obj(data1)
-    # print data1.raw_graph_list[393]
-    # print_obj(data1.raw_graph_list[393])
-    # nx.draw(data1.raw_graph_list[393].g, with_labels=True)
-    # plt.show()
-
-    print("The raw features of all functions of one binary, a list.")
-    print_obj(data)  # acfg list
-    print("\n")
-
-    print("The raw features of one function, made of old_g (the discovRe-style ACFG), g (the Genius-style ACFG), and fun_feature (a vector of function-level features).")
-    print_obj(data.raw_graph_list[0])  # the acfg of one function
-    print("fun_features = function-level features: # 1 function calls # 2 logic instructions # 3 TransferIns # 4 LocalVariables # 5 BB basicblocks # 6 Edges # 7 IncommingCalls # 8 Intrs # 9 between # 10 strings # 11 consts")
-    # feature = data.raw_graph_list[0].fun_features
-    print("old_g:{}".format(data.raw_graph_list[0].old_g))
-    print("g:{}".format(data.raw_graph_list[0].g))
-
-
-    # G = data1.raw_graph_list[393].old_g
-    # print G.node[0]  # G.node[i] is a dict
-    # for key, value in G.node[0].items():
-    #     print('{key}:{value}'.format(key=key, value=value))
-
-    # Basic-block features: #1 'consts' numeric constants #2 'strings' string constants #3 'offs' offspring count #4 'numAs' arithmetic instructions such as INC #5 'numCalls' call instructions #6 'numIns' instruction count #7 'numLIs' logic instructions such as AND #8 'numTIs' transfer instruction count
-    G = data.raw_graph_list[0].g
-    print("# Basic-block features: #1 'consts' numeric constants #2 'strings' string constants #3 'offs' offspring count #4 'numAs' arithmetic instructions such as INC #5 'numCalls' call instructions #6 'numIns' instruction count #7 'numLIs' logic instructions such as AND #8 'numTIs' transfer instruction count")
-    # print(G.node[0])
-    # print("\n")
-    # Features of all basic blocks in this function
-    for key, value in G.node.items():
-        print('{}:{}'.format(key, value))
-
-
-    # old_g is read straight from IDA's CFG, so node counts, directions, etc. match; g is generated from old_g, so it matches too.
-    # old g
-    G = data.raw_graph_list[0].old_g
-    nx.draw(G, with_labels=True)
-    # plt.title('old_g')
-    plt.show()
-
-    # g
-    G = data.raw_graph_list[0].g
-    nx.draw(G, with_labels=True)
-    # plt.title('Genius_g')
-    plt.show()
-
-    # draw graph with labels
-    pos = nx.spring_layout(G)
-    nx.draw(G, pos)
-    node_labels = nx.get_node_attributes(G, 'v')  # networkx nodes carry attributes; g's attribute is 'v', the vector of raw features. For old_g's attributes see cfg_constructor.py
-    nx.draw_networkx_labels(G, pos, labels=node_labels)
-    # plt.title('Genius_g with raw feature vector')
-    plt.show()
-
-
-    # 1 function calls: number of call instructions (call, jal, jalr) in this function. Note: ARM has none of these.
-    # 2 logic instructions: number of logic instructions in this function, e.g. and, or
-    # 3 TransferIns: number of transfer instructions (e.g. jmp; mov on ARM)
-    # 4 LocalVariables: number of local variables
-    # 5 BB: number of basic blocks
-    # 6 Edges: number of icfg edges; the icfg is a feature from the discovRe paper and is ignored here for now
-    # 7 IncommingCalls: number of instructions calling this function
-    # 8 Intrs: number of instructions
-    # 9 between: betweenness, a structural feature
-    # 10 strings: string constants
-    # 11 consts: numeric constants
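The removed inspection script is Python 2; a minimal Python 3 sketch of its load-and-print core (the path and attribute names come from the script above; the binary mode and encoding argument are assumptions for reading a py2-era pickle):

import pickle

testpath = r"D:\hkn\infected\datasets\virusshare_infected23_cfg\VirusShare_9ba64176b2ca61212ff56a5b4eb546ff.ida"
# raw_graphs must be importable for unpickling to succeed.
with open(testpath, "rb") as fr:
    data = pickle.load(fr, encoding="latin1")  # py2-era pickle
print(data.__dict__)                    # the ACFG list for one binary
print(data.raw_graph_list[0].__dict__)  # one function: old_g, g, fun_features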
@@ -1,356 +0,0 @@
-import cPickle as pickle
-from search import *
-from nearpy import Engine
-from nearpy.hashes import RandomDiscretizedProjections
-from nearpy.filters import NearestFilter, UniqueFilter
-from nearpy.distances import EuclideanDistance
-from nearpy.distances import CosineDistance
-from nearpy.hashes import RandomBinaryProjections
-from nearpy.experiments import DistanceRatioExperiment
-from redis import Redis
-from nearpy.storage import RedisStorage
-from feature import *
-import numpy as np
-import os
-import pdb
-import argparse
-import time
-import numpy as np
-from refactoring import *
-import pymongo
-from pymongo import MongoClient
-
-def initDB():
-    client = MongoClient()
-    client = MongoClient('localhost', 27017)
-    client = MongoClient('mongodb://localhost:27017/')
-    db = client.test_database
-    db = client['iot-encoding']
-    return db
-
-db = initDB()
-posts = db.posts
-
-class db:
-
-    def __init__(self):
-        self.feature_list = {}
-        self.engine = None
-
-    def loadHashmap(self, feature_size, result_n):
-        # Create redis storage adapter
-        redis_object = Redis(host='localhost', port=6379, db=0)
-        redis_storage = RedisStorage(redis_object)
-        pdb.set_trace()
-        try:
-            # Get hash config from redis
-            config = redis_storage.load_hash_configuration('test')
-            # Config is existing, create hash with None parameters
-            lshash = RandomBinaryProjections(None, None)
-            # Apply configuration loaded from redis
-            lshash.apply_config(config)
-
-        except:
-            # Config is not existing, create hash from scratch, with 10 projections
-            lshash = RandomBinaryProjections('test', 0)
-
-
-        # Create engine for feature space of 100 dimensions and use our hash.
-        # This will set the dimension of the lshash only the first time, not when
-        # using the configuration loaded from redis. Use redis storage to store
-        # buckets.
-        nearest = NearestFilter(1000)
-        #self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
-        pdb.set_trace()
-        self.engine = Engine(192, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance())
-
-        # Do some stuff like indexing or querying with the engine...
-
-        # Finally store hash configuration in redis for later use
-        redis_storage.store_hash_configuration(lshash)
-
-    def appendToDB(self, binary_name, funcname, fvector, firmware_name=""):
-        if fvector is None:
-            return
-        #ftuple = tuple([fvector])
-        self.engine.store_vector(np.asarray(fvector), ".".join((firmware_name, binary_name, funcname)))
-
-    def batch_appendDB(self, binary_name, features, firmware_name=""):
-        for funcname in features:
-            feature = features[funcname]
-            #pdb.set_trace()
-            self.appendToDB(binary_name, funcname, feature, firmware_name)
-
-    def batch_appendDBbyDir(self, base_dir):
-        cursor = posts.find({"firmware_name": "ddwrt-r21676_result"})
-        i = 0
-        for v in cursor:
-            print i
-            i += 1
-            binary_name = v['binary_name']
-            funcname = v['func_name']
-            firmware_name = v['firmware_name']
-            feature = v['fvector']
-            self.appendToDB(binary_name, funcname, feature, firmware_name)
-
-    def batch_appendDBbyDir1(self, base_dir):
-        image_dir = os.path.join(base_dir, "image")
-        firmware_featrues = {}
-        bnum = 0
-        fnum = 0
-        i = 0
-        pdb.set_trace()
-        for firmware_name in os.listdir(image_dir):
-            print firmware_name
-            firmware_featrues[firmware_name] = {}
-            firmware_dir = os.path.join(image_dir, firmware_name)
-            for binary_name in os.listdir(firmware_dir):
-                if binary_name.endswith(".features"):
-                    bnum += 1
-                    featrues_dir = os.path.join(firmware_dir, binary_name)
-                    featrues = pickle.load(open(featrues_dir, "r"))
-                    for funcname in featrues:
-                        fnum += 1
-                        #pdb.set_trace()
-                        feature = featrues[funcname]
-                        self.appendToDB(binary_name, funcname, feature, firmware_name)
-                    del featrues
-        print("bnum ", bnum)
-        print("fnum ", fnum)
-
-    def dump(self, base_dir):
-        db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
-        pickle.dump(self.feature_list, open(db_dir, 'w'))
-        db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
-        pickle.dump(self.engine, open(db_dir, 'w'))
-
-    def loadDB(self, base_dir):
-        db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
-        self.feature_list = pickle.load(open(db_dir, 'r'))
-        db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
-        self.engine = pickle.load(open(db_dir, 'r'))
-
-    def findF(self, binary_name, funcname):
-        x = [v for v in self.feature_list if binary_name in self.feature_list[v] and funcname in self.feature_list[v][binary_name]]
-        return x[0]
-
-def retrieveFeaturesByDir(n, base_dir):
-    firmware_featrues = {}
-    i = 0
-    for firmware_name in os.listdir(base_dir):
-        if firmware_name.endWith(".features"):
-            firmware_featrues[firmware_name] = {}
-            firmware_dir = os.path.join(base_dir, firmware_name)
-            if i > 0:
-                break
-            i += 1
-            pdb.set_trace()
-            for binary_name in os.listdir(firmware_dir):
-                featrues_dir = os.path.join(firmware_dir, binary_name + "_cb" + str(n) + ".features")
-                featrues = pickle.load(open(featrues_dir, "r"))
-                for funcname in featrues:
-                    feature = featrues[funcname]
-                    self.appendToDB(firmware_name, binary_name, funcname, feature)
-                del featrues
-
-def retrieveFeatures(n, base_dir, filename, funcs):
-    feature_dic = {}
-    featrues_dir = os.path.join(base_dir, "5000", filename + "_cb" + str(n) + ".features")
-    featrues = pickle.load(open(featrues_dir, "r"))
-    #featuresx = retrieveFeaturesx(filename)
-    for name in featrues:
-        #if name in funcs:
-        x = featrues[name]
-        #+ featuresx[name]
-        feature_dic[name] = np.asarray(x)
-    return feature_dic
-
-def retrieveVuldb(base_input_dir):
-    vul_path = os.path.join(base_input_dir, "vul")
-    vul_db = pickle.load(open(vul_path, "r"))
-    return vul_db
-
-
-def retrieveFeaturesx(filename):
-    ida_input_dir = os.path.join("./data/", filename + ".features")
-    featuresx = pickle.load(open(ida_input_dir, "r"))
-    return featuresx
-
-def retrieveQueries(n, base_dir, filename1, featrues_src):
-    queries = {}
-    featrues_dir = os.path.join(base_dir, "5000", filename1 + "_cb" + str(n) + ".features")
-    featrues = pickle.load(open(featrues_dir, "r"))
-    #featuresx = retrieveFeaturesx(filename1)
-    for name in featrues:
-        #if name in featrues_src:
-        x = featrues[name]
-        #+ featuresx[name]
-        queries[name] = np.asarray(x)
-    return queries
-
-def retrieveQueriesbyDir(n, base_dir, firmware_name, filename1):
-    queries = {}
-    featrues_dir = os.path.join(base_dir, firmware_name, filename1 + "_cb" + str(n) + ".features")
-    featrues = pickle.load(open(featrues_dir, "r"))
-    for name in featrues:
-        #del featrues[name][5]
-        queries[name] = np.asarray(featrues[name])
-    return queries
-
-def retrieveQuery(n, base_dir, filename, funcname):
-    featrues_dir = os.path.join(base_dir, filename + "_cb" + str(n) + ".features")
-    featrues = pickle.load(open(featrues_dir, "r"))
-    f = [featrues[v] for v in featrues if funcname in v][0]
-    return np.asarray(f)
-
-def parse_command():
-    parser = argparse.ArgumentParser(description='Process some integers.')
-    parser.add_argument("--base_input_dir", type=str, help="raw binaries to process for training")
-    parser.add_argument('--output_dir', type=str, help="output dir")
-    parser.add_argument("--filename1", type=str, help="the size of each graphlet")
-    parser.add_argument("--filename2", type=str, help="the size of each graphlet")
-    parser.add_argument("--size", type=int, help="the size of each graphlet")
-    #parser.add_argument("--size", type=int, help="the size of each graphlet")
-    args = parser.parse_args()
-    return args
-
-def loadFuncs(path):
-    funcs = {}
-    x86_dir = os.path.join(path, "func_candid")
-    #mips_dir = os.path.join(path, "openssl1.0.1a_mips.ida")
-    fp = open(x86_dir, "r")
-    for line in fp:
-        items = line.split("\n")
-        funcname = items[0]
-        funcs[funcname] = 1
-    return funcs
-
-def dump(path, featrues, queries):
-    fp = open(path + "/" + "matrix", 'w')
-    for name in featrues:
-        row = []
-        row.append("x86")
-        row.append(name)
-        row += featrues[name]
-        fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" %tuple(row))
-    for name in queries:
-        row = []
-        row.append("mips")
-        row.append(name)
-        row += queries[name]
-        fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % tuple(row))
-    fp.close()
-
-
-def queryBytwo(base_input_dir, filename1, filename2, n):
-    threthold = 50
-    db_instance = db()
-    funcs = loadFuncs(base_input_dir)
-    db_instance.loadHashmap(n, 50000)
-    #pdb.set_trace()
-    featrues = retrieveFeatures(n, base_input_dir, filename1, funcs)
-    queries = retrieveQueries(n, base_input_dir, filename2, funcs)
-    #queries = refactoring(queries, featrues)
-    vul_db = retrieveVuldb(base_input_dir)
-    pdb.set_trace()
-    #dump(base_input_dir, featrues, queries)
-    #start = time.time()
-    #db_instance.batch_appendDBbyDir(base_input_dir)
-    #end = time.time()
-    #total = end - start
-    #print total
-    db_instance.batch_appendDB(filename1, featrues)
-    pdb.set_trace()
-    ranks = []
-    times = []
-    for threthold in xrange(1, 210, 10):
-        hit = []
-        i = 0
-        for name in queries:
-            #print i
-            i += 1
-            '''
-            if i == 1000:
-                print (sum(times)/len(times))
-                pdb.set_trace()
-                print "s"
-            '''
-            #if name not in vul_db['openssl']:
-            #    continue
-            if name not in featrues:
-                continue
-            #pdb.set_trace()
-            query = queries[name]
-            #start = time.time()
-            x = db_instance.engine.neighbours(query)
-            #end = time.time()
-            #total = end - start
-            #times.append(total)
-            #print total
-            #pdb.set_trace()
-            try:
-                rank = [v for v in xrange(len(x)) if name in x[v][1]][0]
-                ranks.append((name, rank))
-                if rank <= threthold:
-                    hit.append(1)
-                else:
-                    hit.append(0)
-            except:
-                #pdb.set_trace()
-                hit.append(0)
-                pass
-        #pdb.set_trace()
-        acc = sum(hit) * 1.0 / len(hit)
-        print acc
-
-def queryAll(base_dir, firmware_name, filename1, n):
-    threthold = 155
-    db_instance = db()
-    db_instance.loadHashmap(n, 50000)
-    queries = retrieveQueriesbyDir(n, base_dir, firmware_name, filename1)
-    start = time.time()
-    pdb.set_trace()
-    db_instance.batch_appendDBbyDir(n, base_dir)
-    end = time.time()
-    dur = end - start
-    print dur
-    pdb.set_trace()
-    hit = []
-    i = 0
-    times = []
-    for name in queries:
-        print i
-        i += 1
-        query = queries[name]
-        start = time.clock()
-        x = db_instance.engine.neighbours(query)
-        end = time.clock()
-        dur = end - start
-        times.append(dur)
-        #pdb.set_trace()
-        try:
-            rank = [v for v in xrange(len(x)) if name in x[v][1]]
-            if len(rank) > 1:
-                pdb.set_trace()
-                print "stop"
-            if rank[0] <= threthold:
-                hit.append(1)
-            else:
-                hit.append(0)
-        except:
-            hit.append(0)
-
-    acc = sum(hit) * 1.0 / len(hit)
-    mean = np.mean(times)
-    std = np.std(times)
-    #pdb.set_trace()
-    print acc
-
-if __name__ == "__main__":
-    args = parse_command()
-    base_dir = args.base_input_dir
-    filename1 = args.filename1
-    filename2 = args.filename2
-    n = args.size
-    pdb.set_trace()
-    queryBytwo(base_dir, filename1, filename2, n)
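The deleted search database wraps nearpy's LSH engine around Redis and MongoDB. Stripped of both stores, the core index-and-query flow looks roughly like this (the dimension 192 and the NearestFilter size come from loadHashmap above; the projection count and names are my own choices for a minimal in-memory sketch):

import numpy as np
from nearpy import Engine
from nearpy.filters import NearestFilter
from nearpy.hashes import RandomBinaryProjections

dim = 192  # feature dimension hard-coded in loadHashmap above
engine = Engine(dim,
                lshashes=[RandomBinaryProjections('rbp', 10)],  # 10 random projections
                vector_filters=[NearestFilter(1000)])
engine.store_vector(np.random.randn(dim), 'binary.funcA')  # index one function vector
neighbours = engine.neighbours(np.random.randn(dim))       # [(vector, name, distance), ...]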
@@ -1,16 +0,0 @@
-@echo off
-setlocal EnableDelayedExpansion
-
-
-set "FOLDER_PATH=D:\bishe\dataset\train_malware"
-
-
-
-for %%f in ("%FOLDER_PATH%\*") do (
-    echo !time! %%f
-    D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out %%f
-
-)
-
-endlocal
-