diff --git a/.idea/Gencoding3.iml b/.idea/Gencoding3.iml index 7805102..f7a47fa 100644 --- a/.idea/Gencoding3.iml +++ b/.idea/Gencoding3.iml @@ -4,7 +4,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 7ba73c2..b20e505 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/ida_file_cerate.bat b/Genius3/bat_file/benign/ida_file_cerate.bat similarity index 82% rename from ida_file_cerate.bat rename to Genius3/bat_file/benign/ida_file_cerate.bat index f86dbde..618ba13 100644 --- a/ida_file_cerate.bat +++ b/Genius3/bat_file/benign/ida_file_cerate.bat @@ -2,7 +2,7 @@ setlocal EnableDelayedExpansion -set "FOLDER_PATH=D:\bishe\dataset\train_benign" +set "FOLDER_PATH=D:\bishe\dataset\train_benign_part0" diff --git a/Genius3/bat_file/malware/ida_file_cerate_malware.bat b/Genius3/bat_file/malware/ida_file_cerate_malware.bat new file mode 100644 index 0000000..c061ccf --- /dev/null +++ b/Genius3/bat_file/malware/ida_file_cerate_malware.bat @@ -0,0 +1,16 @@ +@echo off +setlocal EnableDelayedExpansion + + +set "FOLDER_PATH=D:\bishe\dataset\sample_20230130_458" + + + +for %%f in ("%FOLDER_PATH%\*") do ( + echo !time! %%f + D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out %%f + +) + +endlocal + diff --git a/Genius3/main.py b/Genius3/main.py deleted file mode 100644 index 266873d..0000000 --- a/Genius3/main.py +++ /dev/null @@ -1,16 +0,0 @@ -# -*- coding: UTF-8 -*- -import sys - -from func import * -from raw_graphs import * -from idc import * -import os -import argparse -if __name__ == '__main__': - print "hello" - - # - # E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -A -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter - # -c 删除旧数据库 -A 自动分析,不显示对话框 - # -B 相当于 -c -A - diff --git a/Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py b/Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py deleted file mode 100644 index b28ad47..0000000 --- a/Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py +++ /dev/null @@ -1,81 +0,0 @@ -class HierarchicalGraphNeuralNetwork(nn.Module): - def __init__(self, external_vocab: Vocab): - super(HierarchicalGraphNeuralNetwork, self).__init__() - self.pool = 'global_max_pool' - # Hierarchical 1: Control Flow Graph (CFG) embedding and pooling - cfg_filter_list =[200, 200] - cfg_filter_list.insert(0, 11) - self.cfg_filter_length = len(cfg_filter_list) - cfg_graphsage_params = [dict(in_channels=cfg_filter_list[i], out_channels=cfg_filter_list[i + 1], bias=True) for - i in range(self.cfg_filter_length - 1)] - cfg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=cfg_graphsage_params) - cfg_constructor = cfg_conv['constructor'] - for i in range(self.cfg_filter_length - 1): - setattr(self, 'CFG_gnn_{}'.format(i + 1), cfg_constructor(**cfg_conv['kwargs'][i])) - self.dropout = nn.Dropout(p=0.2) - # Hierarchical 2: Function Call Graph (FCG) embedding and pooling - self.external_embedding_layer = nn.Embedding(num_embeddings=external_vocab.max_vocab_size + 2, - embedding_dim=cfg_filter_list[-1], - padding_idx=external_vocab.pad_idx) - fcg_filter_list = [200, 200] - fcg_filter_list.insert(0, cfg_filter_list[-1]) - self.fcg_filter_length = len(fcg_filter_list) - fcg_graphsage_params = [dict(in_channels=fcg_filter_list[i], out_channels=fcg_filter_list[i + 1], bias=True) for - i in range(self.fcg_filter_length - 1)] - fcg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=fcg_graphsage_params) - fcg_constructor = fcg_conv['constructor'] - for i in range(self.fcg_filter_length - 1): - setattr(self, 'FCG_gnn_{}'.format(i + 1), fcg_constructor(**fcg_conv['kwargs'][i])) - # Last Projection Function: gradually project with more linear layers - self.pj1 = torch.nn.Linear(in_features=fcg_filter_list[-1], out_features=int(fcg_filter_list[-1] / 2)) - self.pj2 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 2), out_features=int(fcg_filter_list[-1] / 4)) - self.pj3 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 4), out_features=6) - self.last_activation = nn.Softmax(dim=1) - - def forward(self, real_local_batch: Batch, real_bt_positions: list, bt_external_names: list, - bt_all_function_edges: list): - rtn_local_batch = self.forward_cfg_gnn(local_batch=real_local_batch) - x_cfg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_local_batch.x, batch=rtn_local_batch.batch) - fcg_list = [] - fcg_internal_list = [] - for idx_batch in range(len(real_bt_positions) - 1): - start_pos, end_pos = real_bt_positions[idx_batch: idx_batch + 2] - idx_x_cfg = x_cfg_pool[start_pos: end_pos] - fcg_internal_list.append(idx_x_cfg) - idx_x_external = self.external_embedding_layer( - torch.tensor([bt_external_names[idx_batch]], dtype=torch.long)) - idx_x_external = idx_x_external.squeeze(dim=0) - idx_x_total = torch.cat([idx_x_cfg, idx_x_external], dim=0) - idx_function_edge = torch.tensor(bt_all_function_edges[idx_batch], dtype=torch.long) - idx_graph_data = Data(x=idx_x_total, edge_index=idx_function_edge) - idx_graph_data.validate() - fcg_list.append(idx_graph_data) - fcg_batch = Batch.from_data_list(fcg_list) - # Hierarchical 2: Function Call Graph (FCG) embedding and pooling - rtn_fcg_batch = self.forward_fcg_gnn(function_batch=fcg_batch) # [batch_size, max_node_size, dim] - x_fcg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_fcg_batch.x, batch=rtn_fcg_batch.batch) - batch_final = x_fcg_pool - # step last project to the number_of_classes (multiclass) - bt_final_embed = self.pj3(self.pj2(self.pj1(batch_final))) - bt_pred = self.last_activation(bt_final_embed) - return bt_pred - - def forward_cfg_gnn(self, local_batch: Batch): - in_x, edge_index = local_batch.x, local_batch.edge_index - for i in range(self.cfg_filter_length - 1): - out_x = getattr(self, 'CFG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index) - out_x = torch.nn.functional.relu(out_x, inplace=True) - out_x = self.dropout(out_x) - in_x = out_x - local_batch.x = in_x - return local_batch - - def forward_fcg_gnn(self, function_batch: Batch): - in_x, edge_index = function_batch.x, function_batch.edge_index - for i in range(self.fcg_filter_length - 1): - out_x = getattr(self, 'FCG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index) - out_x = torch.nn.functional.relu(out_x, inplace=True) - out_x = self.dropout(out_x) - in_x = out_x - function_batch.x = in_x - return function_batch \ No newline at end of file diff --git a/Genius3/raw-feature-extractor/convert_pkl_to_json.py b/Genius3/raw-feature-extractor/convert_pkl_to_json.py index 7807f52..5b291ef 100644 --- a/Genius3/raw-feature-extractor/convert_pkl_to_json.py +++ b/Genius3/raw-feature-extractor/convert_pkl_to_json.py @@ -242,7 +242,5 @@ def convert_benign(overhaul): if __name__ == '__main__': - # convert(35, 69) - # convert_benign(True) convert_benign(True) convert_malware(True) diff --git a/Genius3/raw-feature-extractor/discovRe.py b/Genius3/raw-feature-extractor/discovRe.py deleted file mode 100644 index 451999e..0000000 --- a/Genius3/raw-feature-extractor/discovRe.py +++ /dev/null @@ -1,264 +0,0 @@ -# coding=utf-8 -# -# Reference Lister -# -# List all functions and all references to them in the current section. -# -# Implemented with the idautils module -# -import networkx as nx -import pdb -from graph_analysis_ida import * -from graph_property import * - - -# import wingdbstub -# wingdbstub.Ensure() - -def get_funcs(ea): - funcs = {} - # Get current ea - # Loop from start to end in the current segment - for funcea in Functions(SegStart(ea)): - funcname = GetFunctionName(funcea) - func = get_func(funcea) - blocks = FlowChart(func) - funcs[funcname] = [] - for bl in blocks: - start = bl.startEA - end = bl.endEA - funcs[funcname].append((start, end)) - return funcs - - -# 似乎是没用的函数 -# def get_funcs_for_discoverRe(ea): -# features = {} -# for funcea in Functions(SegStart(ea)): -# funcname = GetFunctionName(funcea) -# print(funcname) -# func = get_func(funcea) -# feature = get_discoverRe_feature(func) -# features[funcname] = feature -# return features - - -# 获取所有bb的11维属性特征 -# 调用/传输/算术/逻辑/比较/移动/终止/数据声明/总指令数/字符串或整数常量/后代的数量 -def get_bb_features(func): - bb_features = [] - blocks = [(v.startEA, v.endEA) for v in FlowChart(func)] - for bl in blocks: - calls = calCalls(bl) - transferIns = calTransferIns(bl) - mathematicsIns = calArithmeticIns(bl) - logicIns = calLogicInstructions(bl) - cmpIns = calIns(bl, {'cmp': 1, 'cmps': 1, 'cmpsb': 1, 'cmppd': 1, 'cmpps': 1, 'fcom': 1, 'fcomp': 1, 'fcompp': 1, 'ficom': 1, 'ficomp': 1, 'ptest': 1, 'test': 1}) - movIns = calIns(bl, {'mov': 1, 'movb': 1, 'movw': 1, 'movl': 1, 'movq': 1, 'movabsq': 1, 'push': 1, 'pop': 1, 'lea': 1}) - interruptIns = calIns(bl, {'int1': 1, 'int3': 1, 'into': 1, 'iret': 1, 'iretd': 1, 'iretq': 1}) - declareIns = calIns(bl, {'dw': 1, 'dd': 1, 'db': 1}) - totalIns = calInsts(bl) - consts = getBBconsts(bl) - stringOrIntConsts = len(consts[0]) + len(consts[1]) - bb_features.append([calls, transferIns, mathematicsIns, logicIns, cmpIns, movIns, - interruptIns, declareIns, totalIns, stringOrIntConsts]) - return bb_features - - -def get_discoverRe_feature(func, icfg): - start = func.startEA - end = func.endEA - features = [] - FunctionCalls = getFuncCalls(func) - # 1 - features.append(FunctionCalls) - LogicInstr = getLogicInsts(func) - # 2 - features.append(LogicInstr) - Transfer = getTransferInsts(func) - # 3 - features.append(Transfer) - Locals = getLocalVariables(func) - # 4 - features.append(Locals) - BB = getBasicBlocks(func) - # 5 - features.append(BB) - Edges = len(icfg.edges()) - # 6 - features.append(Edges) - Incoming = getIncommingCalls(func) - # 7 - features.append(Incoming) - # 8 - Instrs = getIntrs(func) - features.append(Instrs) - between = retrieveGP(icfg) - # 9 - features.append(between) - - strings, consts = getfunc_consts(func) - # 10 - features.append(strings) - # 11 - features.append(consts) - return features - - -def get_func_names(ea): - funcs = {} - for funcea in Functions(SegStart(ea)): - funcname = GetFunctionName(funcea) - funcs[funcname] = funcea - return funcs - - -def get_func_bases(ea): - funcs = {} - for funcea in Functions(SegStart(ea)): - funcname = GetFunctionName(funcea) - funcs[funcea] = funcname - return funcs - - -def get_func_range(ea): - funcs = {} - for funcea in Functions(SegStart(ea)): - funcname = GetFunctionName(funcea) - func = get_func(funcea) - funcs[funcname] = (func.startEA, func.endEA) - return funcs - - -def get_func_sequences(ea): - funcs_bodylist = {} - funcs = get_funcs(ea) - for funcname in funcs: - if funcname not in funcs_bodylist: - funcs_bodylist[funcname] = [] - for start, end in funcs[funcname]: - inst_addr = start - while inst_addr <= end: - opcode = GetMnem(inst_addr) - funcs_bodylist[funcname].append(opcode) - inst_addr = NextHead(inst_addr) - return funcs_bodylist - - -def get_func_cfgs(ea): - func_cfglist = {} - i = 0 - start, end = get_section('LOAD') - # print start, end - for funcea in Functions(SegStart(ea)): - if start <= funcea <= end: - funcname = GetFunctionName(funcea) - func = get_func(funcea) - print(i) - i += 1 - try: - icfg = cfg.cfg_construct(func) - func_cfglist[funcname] = icfg - except: - pass - - return func_cfglist - - -def get_section(t): - base = SegByName(t) - start = SegByBase(base) - end = SegEnd(start) - return start, end - - -def get_func_cfg_sequences(func_cfglist): - func_cfg_seqlist = {} - for funcname in func_cfglist: - func_cfg_seqlist[funcname] = {} - cfg = func_cfglist[funcname][0] - for start, end in cfg: - codesq = get_sequences(start, end) - func_cfg_seqlist[funcname][(start, end)] = codesq - - return func_cfg_seqlist - - -def get_sequences(start, end): - seq = [] - inst_addr = start - while inst_addr <= end: - opcode = GetMnem(inst_addr) - seq.append(opcode) - inst_addr = NextHead(inst_addr) - return seq - - -def get_stack_arg(func_addr): - print(func_addr) - args = [] - stack = GetFrame(func_addr) - if not stack: - return [] - firstM = GetFirstMember(stack) - lastM = GetLastMember(stack) - i = firstM - while i <= lastM: - mName = GetMemberName(stack, i) - mSize = GetMemberSize(stack, i) - if mSize: - i = i + mSize - else: - i = i + 4 - if mName not in args and mName and ' s' not in mName and ' r' not in mName: - args.append(mName) - return args - - # pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w')) - - -def processDataSegs(): - funcdata = {} - datafunc = {} - for n in xrange(idaapi.get_segm_qty()): - seg = idaapi.getnseg(n) - ea = seg.startEA - segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE) - if segtype in [idc.SEG_DATA, idc.SEG_BSS]: - start = idc.SegStart(ea) - end = idc.SegEnd(ea) - cur = start - while cur <= end: - refs = [v for v in DataRefsTo(cur)] - for fea in refs: - name = GetFunctionName(fea) - if len(name) == 0: - continue - if name not in funcdata: - funcdata[name] = [cur] - else: - funcdata[name].append(cur) - if cur not in datafunc: - datafunc[cur] = [name] - else: - datafunc[cur].append(name) - cur = NextHead(cur) - return funcdata, datafunc - - -def obtainDataRefs(callgraph): - datarefs = {} - funcdata, datafunc = processDataSegs() - for node in callgraph: - if node in funcdata: - datas = funcdata[node] - for dd in datas: - refs = datafunc[dd] - refs = list(set(refs)) - if node in datarefs: - print(refs) - datarefs[node] += refs - datarefs[node] = list(set(datarefs[node])) - else: - datarefs[node] = refs - return datarefs diff --git a/Genius3/raw-feature-extractor/func.py b/Genius3/raw-feature-extractor/func.py index 33020aa..61f207a 100644 --- a/Genius3/raw-feature-extractor/func.py +++ b/Genius3/raw-feature-extractor/func.py @@ -16,9 +16,7 @@ from raw_graphs import * #from discovRe_feature.discovRe import * from discovRe import * -sys.path.append("D:\\hkn\\project_folder\\Gencoding3\\Genius3\\python") -#import wingdbstub -#wingdbstub.Ensure() + diff --git a/Genius3/raw-feature-extractor/graph_analysis_ida.py b/Genius3/raw-feature-extractor/graph_analysis_ida.py index 390f8f1..66194b3 100644 --- a/Genius3/raw-feature-extractor/graph_analysis_ida.py +++ b/Genius3/raw-feature-extractor/graph_analysis_ida.py @@ -119,24 +119,23 @@ def getIncommingCalls(func): def get_stackVariables(func_addr): - #print func_addr - args = [] - stack = GetFrame(func_addr) - if not stack: - return 0 - firstM = GetFirstMember(stack) - lastM = GetLastMember(stack) - i = firstM - while i <=lastM: - mName = GetMemberName(stack,i) - mSize = GetMemberSize(stack,i) - if mSize: - i = i + mSize - else: - i = i+4 - if mName not in args and mName and 'var_' in mName: - args.append(mName) - return len(args) + args = [] + stack = GetFrame(func_addr) + if not stack: + return 0 + firstM = GetFirstMember(stack) + lastM = GetLastMember(stack) + i = firstM + while i <= lastM: + mName = GetMemberName(stack, i) + mSize = GetMemberSize(stack, i) + if mSize: + i = i + mSize + else: + i = i + 4 + if mName not in args and mName and 'var_' in mName: + args.append(mName) + return len(args) # 计算算数指令数量 diff --git a/Genius3/raw-feature-extractor/preprocessing_ida.py b/Genius3/raw-feature-extractor/preprocessing_ida.py index 4744c07..4a968da 100644 --- a/Genius3/raw-feature-extractor/preprocessing_ida.py +++ b/Genius3/raw-feature-extractor/preprocessing_ida.py @@ -1,7 +1,7 @@ -# coding=utf-8 import os import pickle -import idc +from func import * +from idc import * import idaapi # 定义常量 @@ -12,6 +12,7 @@ CFG_EXTENSION = ".ida" GDL_EXTENSION = ".dot" ASM_EXTENSION = ".asm" + def preprocess(binary_name, workflow): cfg_path = os.path.join( INFECTED_DIR if workflow != "-1" else BENIGN_DIR, @@ -29,9 +30,9 @@ def preprocess(binary_name, workflow): if os.path.exists(cfg_path): idc.Exit(0) else: - analysis_flags = idc.GetShortPrm(idc.INF_START_AF) - analysis_flags &= ~idc.AF_IMMOFF - idc.SetShortPrm(idc.INF_START_AF, analysis_flags) + analysis_flags = idc.GetShortPrm(idc.INF_AF2) + analysis_flags &= ~ida_ida.AF_IMMOFF + idc.SetShortPrm(idc.INF_AF2, analysis_flags) idaapi.autoWait() @@ -47,17 +48,21 @@ def preprocess(binary_name, workflow): # 关闭IDA Pro idc.Exit(0) + def generate_cfg(binary_name, cfg_path): cfgs = get_func_cfgs_c(FirstSeg()) with open(cfg_path, 'wb') as cfg_file: pickle.dump(cfgs, cfg_file) + def generate_gdl(gdl_path): idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT) + def generate_asm(asm_path): idc.GenerateFile(idc.OFILE_ASM, asm_path, 0, idc.BADADDR, 0) + # 主函数 def main(): binary_name = idc.GetInputFile() @@ -68,6 +73,7 @@ def main(): return preprocess(binary_name, workflow) + # 如果是作为IDA Pro的脚本运行,调用主函数 if __name__ == "__main__": main() diff --git a/Genius3/raw-feature-extractor/read_idaFILE.py b/Genius3/raw-feature-extractor/read_idaFILE.py deleted file mode 100644 index aae5416..0000000 --- a/Genius3/raw-feature-extractor/read_idaFILE.py +++ /dev/null @@ -1,101 +0,0 @@ -# -*- coding: UTF-8 -*- -import sys -from matplotlib import pyplot as plt -import networkx as nx -import pickle -# sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/') -# sys.path.insert(1, 'C:/Python27/Lib/site-packages') - - -def print_obj(obj): - # "打印对象的所有属性" - print(obj.__dict__) - - -# sub_10F20 308 反编译代码有字符串,但是这个特征提取里没有字符串 constant,可能是间接引用的,不识别。看了下所有函数的特征,几乎都没有字符串常量,可能都是写在别的地方然后引用的。 -# sub_166C4 393 -if __name__ == '__main__': - testpath = "D:\\hkn\\infected\\datasets\\virusshare_infected23_cfg\\VirusShare_9ba64176b2ca61212ff56a5b4eb546ff.ida" - fr = open(testpath, 'r') - data = pickle.load(fr) #一个二进制文件的acfgs - fr.close() - - # print(type(data1)) - # print_obj(data1) - # print data1.raw_graph_list[393] - # print_obj(data1.raw_graph_list[393]) - # nx.draw(data1.raw_graph_list[393].g,with_labels=True) - # plt.show() - - print("一个二进制文件的所有函数的原始特征,list。") - print_obj(data) # acfg list - print("\n") - - print("一个函数的原始特征,由old_g(discovRe方法的ACFG),g(Genius方法的ACFG),fun_feature(表示函数级别的特征的向量)三部分构成") - print_obj(data.raw_graph_list[0]) # 一个函数的acfg - print("其中fun_features = 函数级别特征: # 1 function calls # 2 logic instructions # 3 TransferIns # 4 LocalVariables # 5 BB basicblocks# 6 Edges # 7 IncommingCalls# 8 Intrs# 9 between # 10 strings # 11 consts") - # feature = data.raw_graph_list[0].fun_features - print("old_g:{}".format(data.raw_graph_list[0].old_g)) - print("g:{}".format(data.raw_graph_list[0].g)) - - - # G = data1.raw_graph_list[393].old_g - # print G.node[0] # G.node[i]是dict - # for key, value in G.node[0].items(): - # print('{key}:{value}'.format(key=key, value=value)) - - # 基本块的特征 #1'consts' 数字常量 #2'strings'字符串常量 #3'offs' offspring 字节点数量? #4'numAs' 算数指令如INC #5'numCalls' 调用指令 #6'numIns' 指令数量 #7'numLIs' LogicInstructions 如AND #8'numTIs' 转移指令数量 - G = data.raw_graph_list[0].g - print("# 基本块的特征 #1'consts' 数字常量 #2'strings'字符串常量 #3'offs' offspring 后代数量 #4'numAs' 算数指令如INC #5'numCalls' 调用指令 #6'numIns' 指令数量 #7'numLIs' LogicInstructions 逻辑如AND #8'numTIs' 转移指令数量") - # print(G.node[0]) - # print("\n") - # 函数内所有基本快的特征 - for key, value in G.node.items(): - print('{}:{}'.format(key, value)) - - - - #oldg就是读取IDA的CFG,所以数量、方向等都一样;g根据old_g生成,也一样 - #old g - G = data.raw_graph_list[0].old_g - nx.draw(G, with_labels=True) - #plt.title('old_g') - plt.show() - - - # g - G = data.raw_graph_list[0].g - nx.draw(G, with_labels=True) - #plt.title('Genius_g') - plt.show() - - # draw graph with labels - pos = nx.spring_layout(G) - nx.draw(G, pos) - node_labels = nx.get_node_attributes(G, 'v') #networkx的node,由属性。g的属性为'v',意为原始特征的vector。old_g的属性见cfg_constructor.py - nx.draw_networkx_labels(G, pos, labels=node_labels) - #plt.title('Genius_g with raw feature vector') - plt.show() - - -# 1 function calls(本函数的函数调用指令(call jal jalr)数量)。。注意arm中没有这些指令 - -# 2 logic instructions ,本函数的逻辑运算指令数量。如and、or的数量 - -# 3 TransferIns 转移指令(如jmp arm中为mov)数量 - -# 4 LocalVariables 局部变量数量 - -# 5 BB basicblocks数量 - -# 6 Edges icfg edges数量。icfg是另一篇论文dicovRe中的特征,这里暂时不管 - -# 7 IncommingCalls,调用本函数的指令数量 - -# 8 Intrs 指令数量 - -# 9 between 结构特征中的betweeness。 - -# 10 strings 字符串 - -# 11 consts 数字常量 \ No newline at end of file diff --git a/Genius3/search-engine/db.py b/Genius3/search-engine/db.py deleted file mode 100644 index bc6c864..0000000 --- a/Genius3/search-engine/db.py +++ /dev/null @@ -1,356 +0,0 @@ -import cPickle as pickle -from search import * -from nearpy import Engine -from nearpy.hashes import RandomDiscretizedProjections -from nearpy.filters import NearestFilter, UniqueFilter -from nearpy.distances import EuclideanDistance -from nearpy.distances import CosineDistance -from nearpy.hashes import RandomBinaryProjections -from nearpy.experiments import DistanceRatioExperiment -from redis import Redis -from nearpy.storage import RedisStorage -from feature import * -import numpy as np -import os -import pdb -import argparse -import time -import numpy as np -from refactoring import * -import pymongo -from pymongo import MongoClient - -def initDB(): - client = MongoClient() - client = MongoClient('localhost', 27017) - client = MongoClient('mongodb://localhost:27017/') - db = client.test_database - db = client['iot-encoding'] - return db - -db = initDB() -posts = db.posts - -class db: - - def __init__(self): - self.feature_list = {} - self.engine = None - - def loadHashmap(self, feature_size, result_n): - # Create redis storage adapter - redis_object = Redis(host='localhost', port=6379, db=0) - redis_storage = RedisStorage(redis_object) - pdb.set_trace() - try: - # Get hash config from redis - config = redis_storage.load_hash_configuration('test') - # Config is existing, create hash with None parameters - lshash = RandomBinaryProjections(None, None) - # Apply configuration loaded from redis - lshash.apply_config(config) - - except: - # Config is not existing, create hash from scratch, with 10 projections - lshash = RandomBinaryProjections('test', 0) - - - # Create engine for feature space of 100 dimensions and use our hash. - # This will set the dimension of the lshash only the first time, not when - # using the configuration loaded from redis. Use redis storage to store - # buckets. - nearest = NearestFilter(1000) - #self.engine = Engine(feature_size, lshashes=[], vector_filters=[]) - pdb.set_trace() - self.engine = Engine(192, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance()) - - # Do some stuff like indexing or querying with the engine... - - # Finally store hash configuration in redis for later use - redis_storage.store_hash_configuration(lshash) - - def appendToDB(self, binary_name, funcname, fvector, firmware_name=""): - if fvector is None: - return - #ftuple = tuple([fvector]) - self.engine.store_vector(np.asarray(fvector), ".".join((firmware_name,binary_name,funcname))) - - def batch_appendDB(self, binary_name, features, firmware_name=""): - for funcname in features: - feature = features[funcname] - #pdb.set_trace() - self.appendToDB(binary_name, funcname, feature, firmware_name) - - def batch_appendDBbyDir(self, base_dir): - cursor = posts.find({"firmware_name":"ddwrt-r21676_result"}) - i = 0 - for v in cursor: - print i - i+=1 - binary_name = v['binary_name'] - funcname = v['func_name'] - firmware_name = v['firmware_name'] - feature = v['fvector'] - self.appendToDB(binary_name, funcname, feature, firmware_name) - - def batch_appendDBbyDir1(self, base_dir): - image_dir = os.path.join(base_dir, "image") - firmware_featrues={} - bnum = 0 - fnum = 0 - i = 0 - pdb.set_trace() - for firmware_name in os.listdir(image_dir): - print firmware_name - firmware_featrues[firmware_name] = {} - firmware_dir = os.path.join(image_dir, firmware_name) - for binary_name in os.listdir(firmware_dir): - if binary_name.endswith(".features"): - bnum += 1 - featrues_dir = os.path.join(firmware_dir, binary_name) - featrues = pickle.load(open(featrues_dir, "r")) - for funcname in featrues: - fnum +=1 - #pdb.set_trace() - feature = featrues[funcname] - self.appendToDB(binary_name, funcname, feature, firmware_name) - del featrues - print("bnum ", bnum) - print("fnum ", fnum) - - def dump(self, base_dir): - db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping") - pickle.dump(self.feature_list, open(db_dir, 'w')) - db_dir = os.path.join(base_dir, "data/db/busybox.hashmap") - pickle.dump(self.engine, open(db_dir, 'w')) - - def loadDB(self, base_dir): - db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping") - self.feature_list = pickle.load(open(db_dir, 'r')) - db_dir = os.path.join(base_dir, "data/db/busybox.hashmap") - self.engine = pickle.load(open(db_dir, 'r')) - - def findF(self, binary_name, funcname): - x = [v for v in self.feature_list if binary_name in self.feature_list[v] and funcname in self.feature_list[v][binary_name]] - return x[0] - -def retrieveFeaturesByDir(n, base_dir): - firmware_featrues={} - i = 0 - for firmware_name in os.listdir(base_dir): - if firmware_name.endWith(".features"): - firmware_featrues[firmware_name] = {} - firmware_dir = os.path.join(base_dir, firmware_name) - if i > 0: - break - i += 1 - pdb.set_trace() - for binary_name in os.listdir(firmware_dir): - featrues_dir = os.path.join(firmware_dir, binary_name + "_cb" + str(n) + ".features") - featrues = pickle.load(open(featrues_dir, "r")) - for funcname in featrues: - feature = featrues[funcname] - self.appendToDB(firmware_name, binary_name, funcname, feature) - del featrues - -def retrieveFeatures(n, base_dir, filename, funcs): - feature_dic = {} - featrues_dir = os.path.join(base_dir, "5000", filename + "_cb" + str(n) + ".features") - featrues = pickle.load(open(featrues_dir, "r")) - #featuresx = retrieveFeaturesx(filename) - for name in featrues: - #if name in funcs: - x = featrues[name] - #+ featuresx[name] - feature_dic[name] = np.asarray(x) - return feature_dic - -def retrieveVuldb(base_input_dir): - vul_path = os.path.join(base_input_dir, "vul") - vul_db = pickle.load(open(vul_path, "r")) - return vul_db - - -def retrieveFeaturesx(filename): - ida_input_dir = os.path.join("./data/", filename + ".features") - featuresx = pickle.load(open(ida_input_dir, "r")) - return featuresx - -def retrieveQueries(n, base_dir, filename1, featrues_src): - queries = {} - featrues_dir = os.path.join(base_dir, "5000", filename1 + "_cb" + str(n) + ".features") - featrues = pickle.load(open(featrues_dir, "r")) - #featuresx = retrieveFeaturesx(filename1) - for name in featrues: - #if name in featrues_src: - x = featrues[name] - #+ featuresx[name] - queries[name] = np.asarray(x) - return queries - -def retrieveQueriesbyDir(n, base_dir, firmware_name, filename1): - queries = {} - featrues_dir = os.path.join(base_dir, firmware_name, filename1 + "_cb" + str(n) + ".features") - featrues = pickle.load(open(featrues_dir, "r")) - for name in featrues: - #del featrues[name][5] - queries[name] = np.asarray(featrues[name]) - return queries - -def retrieveQuery(n, base_dir, filename, funcname): - featrues_dir = os.path.join(base_dir, filename + "_cb" + str(n) + ".features") - featrues = pickle.load(open(featrues_dir, "r")) - f = [featrues[v] for v in featrues if funcname in v ][0] - return np.asarray(f) - -def parse_command(): - parser = argparse.ArgumentParser(description='Process some integers.') - parser.add_argument("--base_input_dir", type=str, help="raw binaries to process for training") - parser.add_argument('--output_dir', type=str, help="output dir") - parser.add_argument("--filename1", type=str, help="the size of each graphlet") - parser.add_argument("--filename2", type=str, help="the size of each graphlet") - parser.add_argument("--size", type=int, help="the size of each graphlet") - #parser.add_argument("--size", type=int, help="the size of each graphlet") - args = parser.parse_args() - return args - -def loadFuncs(path): - funcs = {} - x86_dir = os.path.join(path, "func_candid") - #mips_dir = os.path.join(path, "openssl1.0.1a_mips.ida") - fp = open(x86_dir,"r") - for line in fp: - items = line.split("\n") - funcname = items[0] - funcs[funcname] = 1 - return funcs - -def dump(path, featrues, queries): - fp = open(path + "/" + "matrix", 'w') - for name in featrues: - row = [] - row.append("x86") - row.append(name) - row += featrues[name] - fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" %tuple(row)) - for name in queries: - row = [] - row.append("mips") - row.append(name) - row += queries[name] - fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % tuple(row)) - fp.close() - - -def queryBytwo(base_input_dir, filename1, filename2, n): - threthold = 50 - db_instance = db() - funcs = loadFuncs(base_input_dir) - db_instance.loadHashmap(n, 50000) - #pdb.set_trace() - featrues = retrieveFeatures(n, base_input_dir, filename1, funcs) - queries = retrieveQueries(n, base_input_dir, filename2, funcs) - #queries = refactoring(queries, featrues) - vul_db = retrieveVuldb(base_input_dir) - pdb.set_trace() - #dump(base_input_dir, featrues, queries) - #start = time.time() - #db_instance.batch_appendDBbyDir(base_input_dir) - #end = time.time() - #total = end - start - #print total - db_instance.batch_appendDB(filename1, featrues) - pdb.set_trace() - ranks = [] - times = [] - for threthold in xrange(1, 210, 10): - hit = [] - i = 0 - for name in queries: - #print i - i += 1 - ''' - if i == 1000: - print (sum(times)/len(times)) - pdb.set_trace() - print "s" - ''' - #if name not in vul_db['openssl']: - # continue - if name not in featrues: - continue - #pdb.set_trace() - query = queries[name] - #start = time.time() - x = db_instance.engine.neighbours(query) - #end = time.time() - #total = end - start - #times.append(total) - #print total - #pdb.set_trace() - try: - rank = [v for v in xrange(len(x)) if name in x[v][1]][0] - ranks.append((name, rank)) - if rank <= threthold: - hit.append(1) - else: - hit.append(0) - except: - #pdb.set_trace() - hit.append(0) - pass - #pdb.set_trace() - acc = sum(hit) * 1.0 / len(hit) - print acc - -def queryAll(base_dir, firmware_name, filename1, n): - threthold = 155 - db_instance = db() - db_instance.loadHashmap(n, 50000) - queries = retrieveQueriesbyDir(n, base_dir, firmware_name, filename1) - start = time.time() - pdb.set_trace() - db_instance.batch_appendDBbyDir(n, base_dir) - end = time.time() - dur = end - start - print dur - pdb.set_trace() - hit = [] - i = 0 - times = [] - for name in queries: - print i - i += 1 - query = queries[name] - start = time.clock() - x = db_instance.engine.neighbours(query) - end = time.clock() - dur = end - start - times.append(dur) - #pdb.set_trace() - try: - rank = [v for v in xrange(len(x)) if name in x[v][1]] - if len(rank) > 1: - pdb.set_trace() - print "stop" - if rank[0] <= threthold: - hit.append(1) - else: - hit.append(0) - except: - hit.append(0) - - acc = sum(hit) * 1.0 / len(hit) - mean = np.mean(times) - std = np.std(times) - #pdb.set_trace() - print acc - -if __name__ == "__main__": - args = parse_command() - base_dir = args.base_input_dir - filename1 = args.filename1 - filename2 = args.filename2 - n = args.size - pdb.set_trace() - queryBytwo(base_dir, filename1, filename2, n) diff --git a/ida_file_cerate_malware.bat b/ida_file_cerate_malware.bat deleted file mode 100644 index cd555ed..0000000 --- a/ida_file_cerate_malware.bat +++ /dev/null @@ -1,16 +0,0 @@ -@echo off -setlocal EnableDelayedExpansion - - -set "FOLDER_PATH=D:\bishe\dataset\train_malware" - - - -for %%f in ("%FOLDER_PATH%\*") do ( - echo !time! %%f - D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out %%f - -) - -endlocal -