批量化操作

This commit is contained in:
huihun 2024-03-01 16:11:26 +08:00
parent 0f1e3378a2
commit 548eedb292
14 changed files with 48 additions and 865 deletions

View File

@ -4,7 +4,7 @@
<content url="file://$MODULE_DIR$"> <content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/Genius3/python" isTestSource="false" /> <sourceFolder url="file://$MODULE_DIR$/Genius3/python" isTestSource="false" />
</content> </content>
<orderEntry type="jdk" jdkName="Python 2.7" jdkType="Python SDK" /> <orderEntry type="jdk" jdkName="malgraph" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" /> <orderEntry type="sourceFolder" forTests="false" />
</component> </component>
<component name="PyDocumentationSettings"> <component name="PyDocumentationSettings">

View File

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project version="4"> <project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7" project-jdk-type="Python SDK" /> <component name="ProjectRootManager" version="2" project-jdk-name="malgraph" project-jdk-type="Python SDK" />
</project> </project>

View File

@ -2,7 +2,7 @@
setlocal EnableDelayedExpansion setlocal EnableDelayedExpansion
set "FOLDER_PATH=D:\bishe\dataset\train_benign" set "FOLDER_PATH=D:\bishe\dataset\train_benign_part0"

View File

@ -0,0 +1,16 @@
@echo off
setlocal EnableDelayedExpansion
set "FOLDER_PATH=D:\bishe\dataset\sample_20230130_458"
for %%f in ("%FOLDER_PATH%\*") do (
echo !time! %%f
D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out %%f
)
endlocal

View File

@ -1,16 +0,0 @@
# -*- coding: UTF-8 -*-
import sys
from func import *
from raw_graphs import *
from idc import *
import os
import argparse
if __name__ == '__main__':
print "hello"
#
# E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -A -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
# -c 删除旧数据库 -A 自动分析,不显示对话框
# -B 相当于 -c -A

View File

@ -1,81 +0,0 @@
class HierarchicalGraphNeuralNetwork(nn.Module):
def __init__(self, external_vocab: Vocab):
super(HierarchicalGraphNeuralNetwork, self).__init__()
self.pool = 'global_max_pool'
# Hierarchical 1: Control Flow Graph (CFG) embedding and pooling
cfg_filter_list =[200, 200]
cfg_filter_list.insert(0, 11)
self.cfg_filter_length = len(cfg_filter_list)
cfg_graphsage_params = [dict(in_channels=cfg_filter_list[i], out_channels=cfg_filter_list[i + 1], bias=True) for
i in range(self.cfg_filter_length - 1)]
cfg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=cfg_graphsage_params)
cfg_constructor = cfg_conv['constructor']
for i in range(self.cfg_filter_length - 1):
setattr(self, 'CFG_gnn_{}'.format(i + 1), cfg_constructor(**cfg_conv['kwargs'][i]))
self.dropout = nn.Dropout(p=0.2)
# Hierarchical 2: Function Call Graph (FCG) embedding and pooling
self.external_embedding_layer = nn.Embedding(num_embeddings=external_vocab.max_vocab_size + 2,
embedding_dim=cfg_filter_list[-1],
padding_idx=external_vocab.pad_idx)
fcg_filter_list = [200, 200]
fcg_filter_list.insert(0, cfg_filter_list[-1])
self.fcg_filter_length = len(fcg_filter_list)
fcg_graphsage_params = [dict(in_channels=fcg_filter_list[i], out_channels=fcg_filter_list[i + 1], bias=True) for
i in range(self.fcg_filter_length - 1)]
fcg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=fcg_graphsage_params)
fcg_constructor = fcg_conv['constructor']
for i in range(self.fcg_filter_length - 1):
setattr(self, 'FCG_gnn_{}'.format(i + 1), fcg_constructor(**fcg_conv['kwargs'][i]))
# Last Projection Function: gradually project with more linear layers
self.pj1 = torch.nn.Linear(in_features=fcg_filter_list[-1], out_features=int(fcg_filter_list[-1] / 2))
self.pj2 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 2), out_features=int(fcg_filter_list[-1] / 4))
self.pj3 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 4), out_features=6)
self.last_activation = nn.Softmax(dim=1)
def forward(self, real_local_batch: Batch, real_bt_positions: list, bt_external_names: list,
bt_all_function_edges: list):
rtn_local_batch = self.forward_cfg_gnn(local_batch=real_local_batch)
x_cfg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_local_batch.x, batch=rtn_local_batch.batch)
fcg_list = []
fcg_internal_list = []
for idx_batch in range(len(real_bt_positions) - 1):
start_pos, end_pos = real_bt_positions[idx_batch: idx_batch + 2]
idx_x_cfg = x_cfg_pool[start_pos: end_pos]
fcg_internal_list.append(idx_x_cfg)
idx_x_external = self.external_embedding_layer(
torch.tensor([bt_external_names[idx_batch]], dtype=torch.long))
idx_x_external = idx_x_external.squeeze(dim=0)
idx_x_total = torch.cat([idx_x_cfg, idx_x_external], dim=0)
idx_function_edge = torch.tensor(bt_all_function_edges[idx_batch], dtype=torch.long)
idx_graph_data = Data(x=idx_x_total, edge_index=idx_function_edge)
idx_graph_data.validate()
fcg_list.append(idx_graph_data)
fcg_batch = Batch.from_data_list(fcg_list)
# Hierarchical 2: Function Call Graph (FCG) embedding and pooling
rtn_fcg_batch = self.forward_fcg_gnn(function_batch=fcg_batch) # [batch_size, max_node_size, dim]
x_fcg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_fcg_batch.x, batch=rtn_fcg_batch.batch)
batch_final = x_fcg_pool
# step last project to the number_of_classes (multiclass)
bt_final_embed = self.pj3(self.pj2(self.pj1(batch_final)))
bt_pred = self.last_activation(bt_final_embed)
return bt_pred
def forward_cfg_gnn(self, local_batch: Batch):
in_x, edge_index = local_batch.x, local_batch.edge_index
for i in range(self.cfg_filter_length - 1):
out_x = getattr(self, 'CFG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
out_x = torch.nn.functional.relu(out_x, inplace=True)
out_x = self.dropout(out_x)
in_x = out_x
local_batch.x = in_x
return local_batch
def forward_fcg_gnn(self, function_batch: Batch):
in_x, edge_index = function_batch.x, function_batch.edge_index
for i in range(self.fcg_filter_length - 1):
out_x = getattr(self, 'FCG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
out_x = torch.nn.functional.relu(out_x, inplace=True)
out_x = self.dropout(out_x)
in_x = out_x
function_batch.x = in_x
return function_batch

View File

@ -242,7 +242,5 @@ def convert_benign(overhaul):
if __name__ == '__main__': if __name__ == '__main__':
# convert(35, 69)
# convert_benign(True)
convert_benign(True) convert_benign(True)
convert_malware(True) convert_malware(True)

View File

@ -1,264 +0,0 @@
# coding=utf-8
#
# Reference Lister
#
# List all functions and all references to them in the current section.
#
# Implemented with the idautils module
#
import networkx as nx
import pdb
from graph_analysis_ida import *
from graph_property import *
# import wingdbstub
# wingdbstub.Ensure()
def get_funcs(ea):
funcs = {}
# Get current ea
# Loop from start to end in the current segment
for funcea in Functions(SegStart(ea)):
funcname = GetFunctionName(funcea)
func = get_func(funcea)
blocks = FlowChart(func)
funcs[funcname] = []
for bl in blocks:
start = bl.startEA
end = bl.endEA
funcs[funcname].append((start, end))
return funcs
# 似乎是没用的函数
# def get_funcs_for_discoverRe(ea):
# features = {}
# for funcea in Functions(SegStart(ea)):
# funcname = GetFunctionName(funcea)
# print(funcname)
# func = get_func(funcea)
# feature = get_discoverRe_feature(func)
# features[funcname] = feature
# return features
# 获取所有bb的11维属性特征
# 调用/传输/算术/逻辑/比较/移动/终止/数据声明/总指令数/字符串或整数常量/后代的数量
def get_bb_features(func):
bb_features = []
blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
for bl in blocks:
calls = calCalls(bl)
transferIns = calTransferIns(bl)
mathematicsIns = calArithmeticIns(bl)
logicIns = calLogicInstructions(bl)
cmpIns = calIns(bl, {'cmp': 1, 'cmps': 1, 'cmpsb': 1, 'cmppd': 1, 'cmpps': 1, 'fcom': 1, 'fcomp': 1, 'fcompp': 1, 'ficom': 1, 'ficomp': 1, 'ptest': 1, 'test': 1})
movIns = calIns(bl, {'mov': 1, 'movb': 1, 'movw': 1, 'movl': 1, 'movq': 1, 'movabsq': 1, 'push': 1, 'pop': 1, 'lea': 1})
interruptIns = calIns(bl, {'int1': 1, 'int3': 1, 'into': 1, 'iret': 1, 'iretd': 1, 'iretq': 1})
declareIns = calIns(bl, {'dw': 1, 'dd': 1, 'db': 1})
totalIns = calInsts(bl)
consts = getBBconsts(bl)
stringOrIntConsts = len(consts[0]) + len(consts[1])
bb_features.append([calls, transferIns, mathematicsIns, logicIns, cmpIns, movIns,
interruptIns, declareIns, totalIns, stringOrIntConsts])
return bb_features
def get_discoverRe_feature(func, icfg):
start = func.startEA
end = func.endEA
features = []
FunctionCalls = getFuncCalls(func)
# 1
features.append(FunctionCalls)
LogicInstr = getLogicInsts(func)
# 2
features.append(LogicInstr)
Transfer = getTransferInsts(func)
# 3
features.append(Transfer)
Locals = getLocalVariables(func)
# 4
features.append(Locals)
BB = getBasicBlocks(func)
# 5
features.append(BB)
Edges = len(icfg.edges())
# 6
features.append(Edges)
Incoming = getIncommingCalls(func)
# 7
features.append(Incoming)
# 8
Instrs = getIntrs(func)
features.append(Instrs)
between = retrieveGP(icfg)
# 9
features.append(between)
strings, consts = getfunc_consts(func)
# 10
features.append(strings)
# 11
features.append(consts)
return features
def get_func_names(ea):
funcs = {}
for funcea in Functions(SegStart(ea)):
funcname = GetFunctionName(funcea)
funcs[funcname] = funcea
return funcs
def get_func_bases(ea):
funcs = {}
for funcea in Functions(SegStart(ea)):
funcname = GetFunctionName(funcea)
funcs[funcea] = funcname
return funcs
def get_func_range(ea):
funcs = {}
for funcea in Functions(SegStart(ea)):
funcname = GetFunctionName(funcea)
func = get_func(funcea)
funcs[funcname] = (func.startEA, func.endEA)
return funcs
def get_func_sequences(ea):
funcs_bodylist = {}
funcs = get_funcs(ea)
for funcname in funcs:
if funcname not in funcs_bodylist:
funcs_bodylist[funcname] = []
for start, end in funcs[funcname]:
inst_addr = start
while inst_addr <= end:
opcode = GetMnem(inst_addr)
funcs_bodylist[funcname].append(opcode)
inst_addr = NextHead(inst_addr)
return funcs_bodylist
def get_func_cfgs(ea):
func_cfglist = {}
i = 0
start, end = get_section('LOAD')
# print start, end
for funcea in Functions(SegStart(ea)):
if start <= funcea <= end:
funcname = GetFunctionName(funcea)
func = get_func(funcea)
print(i)
i += 1
try:
icfg = cfg.cfg_construct(func)
func_cfglist[funcname] = icfg
except:
pass
return func_cfglist
def get_section(t):
base = SegByName(t)
start = SegByBase(base)
end = SegEnd(start)
return start, end
def get_func_cfg_sequences(func_cfglist):
func_cfg_seqlist = {}
for funcname in func_cfglist:
func_cfg_seqlist[funcname] = {}
cfg = func_cfglist[funcname][0]
for start, end in cfg:
codesq = get_sequences(start, end)
func_cfg_seqlist[funcname][(start, end)] = codesq
return func_cfg_seqlist
def get_sequences(start, end):
seq = []
inst_addr = start
while inst_addr <= end:
opcode = GetMnem(inst_addr)
seq.append(opcode)
inst_addr = NextHead(inst_addr)
return seq
def get_stack_arg(func_addr):
print(func_addr)
args = []
stack = GetFrame(func_addr)
if not stack:
return []
firstM = GetFirstMember(stack)
lastM = GetLastMember(stack)
i = firstM
while i <= lastM:
mName = GetMemberName(stack, i)
mSize = GetMemberSize(stack, i)
if mSize:
i = i + mSize
else:
i = i + 4
if mName not in args and mName and ' s' not in mName and ' r' not in mName:
args.append(mName)
return args
# pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))
def processDataSegs():
funcdata = {}
datafunc = {}
for n in xrange(idaapi.get_segm_qty()):
seg = idaapi.getnseg(n)
ea = seg.startEA
segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE)
if segtype in [idc.SEG_DATA, idc.SEG_BSS]:
start = idc.SegStart(ea)
end = idc.SegEnd(ea)
cur = start
while cur <= end:
refs = [v for v in DataRefsTo(cur)]
for fea in refs:
name = GetFunctionName(fea)
if len(name) == 0:
continue
if name not in funcdata:
funcdata[name] = [cur]
else:
funcdata[name].append(cur)
if cur not in datafunc:
datafunc[cur] = [name]
else:
datafunc[cur].append(name)
cur = NextHead(cur)
return funcdata, datafunc
def obtainDataRefs(callgraph):
datarefs = {}
funcdata, datafunc = processDataSegs()
for node in callgraph:
if node in funcdata:
datas = funcdata[node]
for dd in datas:
refs = datafunc[dd]
refs = list(set(refs))
if node in datarefs:
print(refs)
datarefs[node] += refs
datarefs[node] = list(set(datarefs[node]))
else:
datarefs[node] = refs
return datarefs

View File

@ -16,9 +16,7 @@ from raw_graphs import *
#from discovRe_feature.discovRe import * #from discovRe_feature.discovRe import *
from discovRe import * from discovRe import *
sys.path.append("D:\\hkn\\project_folder\\Gencoding3\\Genius3\\python")
#import wingdbstub
#wingdbstub.Ensure()

View File

@ -119,7 +119,6 @@ def getIncommingCalls(func):
def get_stackVariables(func_addr): def get_stackVariables(func_addr):
#print func_addr
args = [] args = []
stack = GetFrame(func_addr) stack = GetFrame(func_addr)
if not stack: if not stack:
@ -127,13 +126,13 @@ def get_stackVariables(func_addr):
firstM = GetFirstMember(stack) firstM = GetFirstMember(stack)
lastM = GetLastMember(stack) lastM = GetLastMember(stack)
i = firstM i = firstM
while i <=lastM: while i <= lastM:
mName = GetMemberName(stack,i) mName = GetMemberName(stack, i)
mSize = GetMemberSize(stack,i) mSize = GetMemberSize(stack, i)
if mSize: if mSize:
i = i + mSize i = i + mSize
else: else:
i = i+4 i = i + 4
if mName not in args and mName and 'var_' in mName: if mName not in args and mName and 'var_' in mName:
args.append(mName) args.append(mName)
return len(args) return len(args)

View File

@ -1,7 +1,7 @@
# coding=utf-8
import os import os
import pickle import pickle
import idc from func import *
from idc import *
import idaapi import idaapi
# 定义常量 # 定义常量
@ -12,6 +12,7 @@ CFG_EXTENSION = ".ida"
GDL_EXTENSION = ".dot" GDL_EXTENSION = ".dot"
ASM_EXTENSION = ".asm" ASM_EXTENSION = ".asm"
def preprocess(binary_name, workflow): def preprocess(binary_name, workflow):
cfg_path = os.path.join( cfg_path = os.path.join(
INFECTED_DIR if workflow != "-1" else BENIGN_DIR, INFECTED_DIR if workflow != "-1" else BENIGN_DIR,
@ -29,9 +30,9 @@ def preprocess(binary_name, workflow):
if os.path.exists(cfg_path): if os.path.exists(cfg_path):
idc.Exit(0) idc.Exit(0)
else: else:
analysis_flags = idc.GetShortPrm(idc.INF_START_AF) analysis_flags = idc.GetShortPrm(idc.INF_AF2)
analysis_flags &= ~idc.AF_IMMOFF analysis_flags &= ~ida_ida.AF_IMMOFF
idc.SetShortPrm(idc.INF_START_AF, analysis_flags) idc.SetShortPrm(idc.INF_AF2, analysis_flags)
idaapi.autoWait() idaapi.autoWait()
@ -47,17 +48,21 @@ def preprocess(binary_name, workflow):
# 关闭IDA Pro # 关闭IDA Pro
idc.Exit(0) idc.Exit(0)
def generate_cfg(binary_name, cfg_path): def generate_cfg(binary_name, cfg_path):
cfgs = get_func_cfgs_c(FirstSeg()) cfgs = get_func_cfgs_c(FirstSeg())
with open(cfg_path, 'wb') as cfg_file: with open(cfg_path, 'wb') as cfg_file:
pickle.dump(cfgs, cfg_file) pickle.dump(cfgs, cfg_file)
def generate_gdl(gdl_path): def generate_gdl(gdl_path):
idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT) idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT)
def generate_asm(asm_path): def generate_asm(asm_path):
idc.GenerateFile(idc.OFILE_ASM, asm_path, 0, idc.BADADDR, 0) idc.GenerateFile(idc.OFILE_ASM, asm_path, 0, idc.BADADDR, 0)
# 主函数 # 主函数
def main(): def main():
binary_name = idc.GetInputFile() binary_name = idc.GetInputFile()
@ -68,6 +73,7 @@ def main():
return return
preprocess(binary_name, workflow) preprocess(binary_name, workflow)
# 如果是作为IDA Pro的脚本运行调用主函数 # 如果是作为IDA Pro的脚本运行调用主函数
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@ -1,101 +0,0 @@
# -*- coding: UTF-8 -*-
import sys
from matplotlib import pyplot as plt
import networkx as nx
import pickle
# sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
# sys.path.insert(1, 'C:/Python27/Lib/site-packages')
def print_obj(obj):
# "打印对象的所有属性"
print(obj.__dict__)
# sub_10F20 308 反编译代码有字符串,但是这个特征提取里没有字符串 constant可能是间接引用的不识别。看了下所有函数的特征几乎都没有字符串常量可能都是写在别的地方然后引用的。
# sub_166C4 393
if __name__ == '__main__':
testpath = "D:\\hkn\\infected\\datasets\\virusshare_infected23_cfg\\VirusShare_9ba64176b2ca61212ff56a5b4eb546ff.ida"
fr = open(testpath, 'r')
data = pickle.load(fr) #一个二进制文件的acfgs
fr.close()
# print(type(data1))
# print_obj(data1)
# print data1.raw_graph_list[393]
# print_obj(data1.raw_graph_list[393])
# nx.draw(data1.raw_graph_list[393].g,with_labels=True)
# plt.show()
print("一个二进制文件的所有函数的原始特征list。")
print_obj(data) # acfg list
print("\n")
print("一个函数的原始特征由old_gdiscovRe方法的ACFGgGenius方法的ACFGfun_feature表示函数级别的特征的向量三部分构成")
print_obj(data.raw_graph_list[0]) # 一个函数的acfg
print("其中fun_features = 函数级别特征: # 1 function calls # 2 logic instructions # 3 TransferIns # 4 LocalVariables # 5 BB basicblocks# 6 Edges # 7 IncommingCalls# 8 Intrs# 9 between # 10 strings # 11 consts")
# feature = data.raw_graph_list[0].fun_features
print("old_g:{}".format(data.raw_graph_list[0].old_g))
print("g:{}".format(data.raw_graph_list[0].g))
# G = data1.raw_graph_list[393].old_g
# print G.node[0] # G.node[i]是dict
# for key, value in G.node[0].items():
# print('{key}:{value}'.format(key=key, value=value))
# 基本块的特征 #1'consts' 数字常量 #2'strings'字符串常量 #3'offs' offspring 字节点数量? #4'numAs' 算数指令如INC #5'numCalls' 调用指令 #6'numIns' 指令数量 #7'numLIs' LogicInstructions 如AND #8'numTIs' 转移指令数量
G = data.raw_graph_list[0].g
print("# 基本块的特征 #1'consts' 数字常量 #2'strings'字符串常量 #3'offs' offspring 后代数量 #4'numAs' 算数指令如INC #5'numCalls' 调用指令 #6'numIns' 指令数量 #7'numLIs' LogicInstructions 逻辑如AND #8'numTIs' 转移指令数量")
# print(G.node[0])
# print("\n")
# 函数内所有基本快的特征
for key, value in G.node.items():
print('{}:{}'.format(key, value))
#oldg就是读取IDA的CFG所以数量、方向等都一样g根据old_g生成也一样
#old g
G = data.raw_graph_list[0].old_g
nx.draw(G, with_labels=True)
#plt.title('old_g')
plt.show()
# g
G = data.raw_graph_list[0].g
nx.draw(G, with_labels=True)
#plt.title('Genius_g')
plt.show()
# draw graph with labels
pos = nx.spring_layout(G)
nx.draw(G, pos)
node_labels = nx.get_node_attributes(G, 'v') #networkx的node由属性。g的属性为'v'意为原始特征的vector。old_g的属性见cfg_constructor.py
nx.draw_networkx_labels(G, pos, labels=node_labels)
#plt.title('Genius_g with raw feature vector')
plt.show()
# 1 function calls本函数的函数调用指令call jal jalr数量。。注意arm中没有这些指令
# 2 logic instructions 本函数的逻辑运算指令数量。如and、or的数量
# 3 TransferIns 转移指令如jmp arm中为mov数量
# 4 LocalVariables 局部变量数量
# 5 BB basicblocks数量
# 6 Edges icfg edges数量。icfg是另一篇论文dicovRe中的特征这里暂时不管
# 7 IncommingCalls调用本函数的指令数量
# 8 Intrs 指令数量
# 9 between 结构特征中的betweeness。
# 10 strings 字符串
# 11 consts 数字常量

View File

@ -1,356 +0,0 @@
import cPickle as pickle
from search import *
from nearpy import Engine
from nearpy.hashes import RandomDiscretizedProjections
from nearpy.filters import NearestFilter, UniqueFilter
from nearpy.distances import EuclideanDistance
from nearpy.distances import CosineDistance
from nearpy.hashes import RandomBinaryProjections
from nearpy.experiments import DistanceRatioExperiment
from redis import Redis
from nearpy.storage import RedisStorage
from feature import *
import numpy as np
import os
import pdb
import argparse
import time
import numpy as np
from refactoring import *
import pymongo
from pymongo import MongoClient
def initDB():
client = MongoClient()
client = MongoClient('localhost', 27017)
client = MongoClient('mongodb://localhost:27017/')
db = client.test_database
db = client['iot-encoding']
return db
db = initDB()
posts = db.posts
class db:
def __init__(self):
self.feature_list = {}
self.engine = None
def loadHashmap(self, feature_size, result_n):
# Create redis storage adapter
redis_object = Redis(host='localhost', port=6379, db=0)
redis_storage = RedisStorage(redis_object)
pdb.set_trace()
try:
# Get hash config from redis
config = redis_storage.load_hash_configuration('test')
# Config is existing, create hash with None parameters
lshash = RandomBinaryProjections(None, None)
# Apply configuration loaded from redis
lshash.apply_config(config)
except:
# Config is not existing, create hash from scratch, with 10 projections
lshash = RandomBinaryProjections('test', 0)
# Create engine for feature space of 100 dimensions and use our hash.
# This will set the dimension of the lshash only the first time, not when
# using the configuration loaded from redis. Use redis storage to store
# buckets.
nearest = NearestFilter(1000)
#self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
pdb.set_trace()
self.engine = Engine(192, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance())
# Do some stuff like indexing or querying with the engine...
# Finally store hash configuration in redis for later use
redis_storage.store_hash_configuration(lshash)
def appendToDB(self, binary_name, funcname, fvector, firmware_name=""):
if fvector is None:
return
#ftuple = tuple([fvector])
self.engine.store_vector(np.asarray(fvector), ".".join((firmware_name,binary_name,funcname)))
def batch_appendDB(self, binary_name, features, firmware_name=""):
for funcname in features:
feature = features[funcname]
#pdb.set_trace()
self.appendToDB(binary_name, funcname, feature, firmware_name)
def batch_appendDBbyDir(self, base_dir):
cursor = posts.find({"firmware_name":"ddwrt-r21676_result"})
i = 0
for v in cursor:
print i
i+=1
binary_name = v['binary_name']
funcname = v['func_name']
firmware_name = v['firmware_name']
feature = v['fvector']
self.appendToDB(binary_name, funcname, feature, firmware_name)
def batch_appendDBbyDir1(self, base_dir):
image_dir = os.path.join(base_dir, "image")
firmware_featrues={}
bnum = 0
fnum = 0
i = 0
pdb.set_trace()
for firmware_name in os.listdir(image_dir):
print firmware_name
firmware_featrues[firmware_name] = {}
firmware_dir = os.path.join(image_dir, firmware_name)
for binary_name in os.listdir(firmware_dir):
if binary_name.endswith(".features"):
bnum += 1
featrues_dir = os.path.join(firmware_dir, binary_name)
featrues = pickle.load(open(featrues_dir, "r"))
for funcname in featrues:
fnum +=1
#pdb.set_trace()
feature = featrues[funcname]
self.appendToDB(binary_name, funcname, feature, firmware_name)
del featrues
print("bnum ", bnum)
print("fnum ", fnum)
def dump(self, base_dir):
db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
pickle.dump(self.feature_list, open(db_dir, 'w'))
db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
pickle.dump(self.engine, open(db_dir, 'w'))
def loadDB(self, base_dir):
db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
self.feature_list = pickle.load(open(db_dir, 'r'))
db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
self.engine = pickle.load(open(db_dir, 'r'))
def findF(self, binary_name, funcname):
x = [v for v in self.feature_list if binary_name in self.feature_list[v] and funcname in self.feature_list[v][binary_name]]
return x[0]
def retrieveFeaturesByDir(n, base_dir):
firmware_featrues={}
i = 0
for firmware_name in os.listdir(base_dir):
if firmware_name.endWith(".features"):
firmware_featrues[firmware_name] = {}
firmware_dir = os.path.join(base_dir, firmware_name)
if i > 0:
break
i += 1
pdb.set_trace()
for binary_name in os.listdir(firmware_dir):
featrues_dir = os.path.join(firmware_dir, binary_name + "_cb" + str(n) + ".features")
featrues = pickle.load(open(featrues_dir, "r"))
for funcname in featrues:
feature = featrues[funcname]
self.appendToDB(firmware_name, binary_name, funcname, feature)
del featrues
def retrieveFeatures(n, base_dir, filename, funcs):
feature_dic = {}
featrues_dir = os.path.join(base_dir, "5000", filename + "_cb" + str(n) + ".features")
featrues = pickle.load(open(featrues_dir, "r"))
#featuresx = retrieveFeaturesx(filename)
for name in featrues:
#if name in funcs:
x = featrues[name]
#+ featuresx[name]
feature_dic[name] = np.asarray(x)
return feature_dic
def retrieveVuldb(base_input_dir):
vul_path = os.path.join(base_input_dir, "vul")
vul_db = pickle.load(open(vul_path, "r"))
return vul_db
def retrieveFeaturesx(filename):
ida_input_dir = os.path.join("./data/", filename + ".features")
featuresx = pickle.load(open(ida_input_dir, "r"))
return featuresx
def retrieveQueries(n, base_dir, filename1, featrues_src):
queries = {}
featrues_dir = os.path.join(base_dir, "5000", filename1 + "_cb" + str(n) + ".features")
featrues = pickle.load(open(featrues_dir, "r"))
#featuresx = retrieveFeaturesx(filename1)
for name in featrues:
#if name in featrues_src:
x = featrues[name]
#+ featuresx[name]
queries[name] = np.asarray(x)
return queries
def retrieveQueriesbyDir(n, base_dir, firmware_name, filename1):
queries = {}
featrues_dir = os.path.join(base_dir, firmware_name, filename1 + "_cb" + str(n) + ".features")
featrues = pickle.load(open(featrues_dir, "r"))
for name in featrues:
#del featrues[name][5]
queries[name] = np.asarray(featrues[name])
return queries
def retrieveQuery(n, base_dir, filename, funcname):
featrues_dir = os.path.join(base_dir, filename + "_cb" + str(n) + ".features")
featrues = pickle.load(open(featrues_dir, "r"))
f = [featrues[v] for v in featrues if funcname in v ][0]
return np.asarray(f)
def parse_command():
parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument("--base_input_dir", type=str, help="raw binaries to process for training")
parser.add_argument('--output_dir', type=str, help="output dir")
parser.add_argument("--filename1", type=str, help="the size of each graphlet")
parser.add_argument("--filename2", type=str, help="the size of each graphlet")
parser.add_argument("--size", type=int, help="the size of each graphlet")
#parser.add_argument("--size", type=int, help="the size of each graphlet")
args = parser.parse_args()
return args
def loadFuncs(path):
funcs = {}
x86_dir = os.path.join(path, "func_candid")
#mips_dir = os.path.join(path, "openssl1.0.1a_mips.ida")
fp = open(x86_dir,"r")
for line in fp:
items = line.split("\n")
funcname = items[0]
funcs[funcname] = 1
return funcs
def dump(path, featrues, queries):
fp = open(path + "/" + "matrix", 'w')
for name in featrues:
row = []
row.append("x86")
row.append(name)
row += featrues[name]
fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" %tuple(row))
for name in queries:
row = []
row.append("mips")
row.append(name)
row += queries[name]
fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % tuple(row))
fp.close()
def queryBytwo(base_input_dir, filename1, filename2, n):
threthold = 50
db_instance = db()
funcs = loadFuncs(base_input_dir)
db_instance.loadHashmap(n, 50000)
#pdb.set_trace()
featrues = retrieveFeatures(n, base_input_dir, filename1, funcs)
queries = retrieveQueries(n, base_input_dir, filename2, funcs)
#queries = refactoring(queries, featrues)
vul_db = retrieveVuldb(base_input_dir)
pdb.set_trace()
#dump(base_input_dir, featrues, queries)
#start = time.time()
#db_instance.batch_appendDBbyDir(base_input_dir)
#end = time.time()
#total = end - start
#print total
db_instance.batch_appendDB(filename1, featrues)
pdb.set_trace()
ranks = []
times = []
for threthold in xrange(1, 210, 10):
hit = []
i = 0
for name in queries:
#print i
i += 1
'''
if i == 1000:
print (sum(times)/len(times))
pdb.set_trace()
print "s"
'''
#if name not in vul_db['openssl']:
# continue
if name not in featrues:
continue
#pdb.set_trace()
query = queries[name]
#start = time.time()
x = db_instance.engine.neighbours(query)
#end = time.time()
#total = end - start
#times.append(total)
#print total
#pdb.set_trace()
try:
rank = [v for v in xrange(len(x)) if name in x[v][1]][0]
ranks.append((name, rank))
if rank <= threthold:
hit.append(1)
else:
hit.append(0)
except:
#pdb.set_trace()
hit.append(0)
pass
#pdb.set_trace()
acc = sum(hit) * 1.0 / len(hit)
print acc
def queryAll(base_dir, firmware_name, filename1, n):
threthold = 155
db_instance = db()
db_instance.loadHashmap(n, 50000)
queries = retrieveQueriesbyDir(n, base_dir, firmware_name, filename1)
start = time.time()
pdb.set_trace()
db_instance.batch_appendDBbyDir(n, base_dir)
end = time.time()
dur = end - start
print dur
pdb.set_trace()
hit = []
i = 0
times = []
for name in queries:
print i
i += 1
query = queries[name]
start = time.clock()
x = db_instance.engine.neighbours(query)
end = time.clock()
dur = end - start
times.append(dur)
#pdb.set_trace()
try:
rank = [v for v in xrange(len(x)) if name in x[v][1]]
if len(rank) > 1:
pdb.set_trace()
print "stop"
if rank[0] <= threthold:
hit.append(1)
else:
hit.append(0)
except:
hit.append(0)
acc = sum(hit) * 1.0 / len(hit)
mean = np.mean(times)
std = np.std(times)
#pdb.set_trace()
print acc
if __name__ == "__main__":
args = parse_command()
base_dir = args.base_input_dir
filename1 = args.filename1
filename2 = args.filename2
n = args.size
pdb.set_trace()
queryBytwo(base_dir, filename1, filename2, n)

View File

@ -1,16 +0,0 @@
@echo off
setlocal EnableDelayedExpansion
set "FOLDER_PATH=D:\bishe\dataset\train_malware"
for %%f in ("%FOLDER_PATH%\*") do (
echo !time! %%f
D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out %%f
)
endlocal