Batch operations
This commit is contained in:
parent 0f1e3378a2
commit 548eedb292
@@ -4,7 +4,7 @@
     <content url="file://$MODULE_DIR$">
       <sourceFolder url="file://$MODULE_DIR$/Genius3/python" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Python 2.7" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="malgraph" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyDocumentationSettings">
@@ -1,4 +1,4 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="malgraph" project-jdk-type="Python SDK" />
 </project>
@@ -2,7 +2,7 @@
 setlocal EnableDelayedExpansion
 
 
-set "FOLDER_PATH=D:\bishe\dataset\train_benign"
+set "FOLDER_PATH=D:\bishe\dataset\train_benign_part0"
 
 
 
16 Genius3/bat_file/malware/ida_file_cerate_malware.bat Normal file
@@ -0,0 +1,16 @@
+@echo off
+setlocal EnableDelayedExpansion
+
+
+set "FOLDER_PATH=D:\bishe\dataset\sample_20230130_458"
+
+
+
+for %%f in ("%FOLDER_PATH%\*") do (
+    echo !time! %%f
+    D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out %%f
+
+)
+
+endlocal
+
@@ -1,16 +0,0 @@
-# -*- coding: UTF-8 -*-
-import sys
-
-from func import *
-from raw_graphs import *
-from idc import *
-import os
-import argparse
-if __name__ == '__main__':
-    print "hello"
-
-#
-# E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -A -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
-# -c: delete the old database   -A: run auto-analysis without showing dialogs
-# -B: equivalent to -c -A
-
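Note: the .bat files in this commit and the command line above share the same headless pattern. A minimal Python sketch of that batch loop, not part of the repo; paths are assumed from the scripts above:

import os
import subprocess

FOLDER_PATH = r"D:\bishe\dataset\sample_20230130_458"   # assumed from the .bat above
IDA = r"D:\IDA_Pro_v6.8\idaq64.exe"
SCRIPT = r"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py"

for name in os.listdir(FOLDER_PATH):
    sample = os.path.join(FOLDER_PATH, name)
    # -c: delete the old database; -A: auto-analysis without dialogs
    subprocess.call([IDA, "-c", "-A", '-S"%s 0"' % SCRIPT,
                     r"-oD:\bishe\dataset\out", sample])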
@@ -1,81 +0,0 @@
-class HierarchicalGraphNeuralNetwork(nn.Module):
-    def __init__(self, external_vocab: Vocab):
-        super(HierarchicalGraphNeuralNetwork, self).__init__()
-        self.pool = 'global_max_pool'
-        # Hierarchical 1: Control Flow Graph (CFG) embedding and pooling
-        cfg_filter_list = [200, 200]
-        cfg_filter_list.insert(0, 11)
-        self.cfg_filter_length = len(cfg_filter_list)
-        cfg_graphsage_params = [dict(in_channels=cfg_filter_list[i], out_channels=cfg_filter_list[i + 1], bias=True) for
-                                i in range(self.cfg_filter_length - 1)]
-        cfg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=cfg_graphsage_params)
-        cfg_constructor = cfg_conv['constructor']
-        for i in range(self.cfg_filter_length - 1):
-            setattr(self, 'CFG_gnn_{}'.format(i + 1), cfg_constructor(**cfg_conv['kwargs'][i]))
-        self.dropout = nn.Dropout(p=0.2)
-        # Hierarchical 2: Function Call Graph (FCG) embedding and pooling
-        self.external_embedding_layer = nn.Embedding(num_embeddings=external_vocab.max_vocab_size + 2,
-                                                     embedding_dim=cfg_filter_list[-1],
-                                                     padding_idx=external_vocab.pad_idx)
-        fcg_filter_list = [200, 200]
-        fcg_filter_list.insert(0, cfg_filter_list[-1])
-        self.fcg_filter_length = len(fcg_filter_list)
-        fcg_graphsage_params = [dict(in_channels=fcg_filter_list[i], out_channels=fcg_filter_list[i + 1], bias=True) for
-                                i in range(self.fcg_filter_length - 1)]
-        fcg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=fcg_graphsage_params)
-        fcg_constructor = fcg_conv['constructor']
-        for i in range(self.fcg_filter_length - 1):
-            setattr(self, 'FCG_gnn_{}'.format(i + 1), fcg_constructor(**fcg_conv['kwargs'][i]))
-        # Last Projection Function: gradually project with more linear layers
-        self.pj1 = torch.nn.Linear(in_features=fcg_filter_list[-1], out_features=int(fcg_filter_list[-1] / 2))
-        self.pj2 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 2), out_features=int(fcg_filter_list[-1] / 4))
-        self.pj3 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 4), out_features=6)
-        self.last_activation = nn.Softmax(dim=1)
-
-    def forward(self, real_local_batch: Batch, real_bt_positions: list, bt_external_names: list,
-                bt_all_function_edges: list):
-        rtn_local_batch = self.forward_cfg_gnn(local_batch=real_local_batch)
-        x_cfg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_local_batch.x, batch=rtn_local_batch.batch)
-        fcg_list = []
-        fcg_internal_list = []
-        for idx_batch in range(len(real_bt_positions) - 1):
-            start_pos, end_pos = real_bt_positions[idx_batch: idx_batch + 2]
-            idx_x_cfg = x_cfg_pool[start_pos: end_pos]
-            fcg_internal_list.append(idx_x_cfg)
-            idx_x_external = self.external_embedding_layer(
-                torch.tensor([bt_external_names[idx_batch]], dtype=torch.long))
-            idx_x_external = idx_x_external.squeeze(dim=0)
-            idx_x_total = torch.cat([idx_x_cfg, idx_x_external], dim=0)
-            idx_function_edge = torch.tensor(bt_all_function_edges[idx_batch], dtype=torch.long)
-            idx_graph_data = Data(x=idx_x_total, edge_index=idx_function_edge)
-            idx_graph_data.validate()
-            fcg_list.append(idx_graph_data)
-        fcg_batch = Batch.from_data_list(fcg_list)
-        # Hierarchical 2: Function Call Graph (FCG) embedding and pooling
-        rtn_fcg_batch = self.forward_fcg_gnn(function_batch=fcg_batch)  # [batch_size, max_node_size, dim]
-        x_fcg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_fcg_batch.x, batch=rtn_fcg_batch.batch)
-        batch_final = x_fcg_pool
-        # last step: project to the number_of_classes (multiclass)
-        bt_final_embed = self.pj3(self.pj2(self.pj1(batch_final)))
-        bt_pred = self.last_activation(bt_final_embed)
-        return bt_pred
-
-    def forward_cfg_gnn(self, local_batch: Batch):
-        in_x, edge_index = local_batch.x, local_batch.edge_index
-        for i in range(self.cfg_filter_length - 1):
-            out_x = getattr(self, 'CFG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
-            out_x = torch.nn.functional.relu(out_x, inplace=True)
-            out_x = self.dropout(out_x)
-            in_x = out_x
-        local_batch.x = in_x
-        return local_batch
-
-    def forward_fcg_gnn(self, function_batch: Batch):
-        in_x, edge_index = function_batch.x, function_batch.edge_index
-        for i in range(self.fcg_filter_length - 1):
-            out_x = getattr(self, 'FCG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
-            out_x = torch.nn.functional.relu(out_x, inplace=True)
-            out_x = self.dropout(out_x)
-            in_x = out_x
-        function_batch.x = in_x
-        return function_batch
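Note: the two global_max_pool calls are what make the removed model hierarchical: basic-block features are max-pooled into one vector per CFG, and those vectors then become node features of the function call graph. A minimal sketch of the pooling semantics, assuming torch and torch_geometric are installed:

import torch
from torch_geometric.nn import global_max_pool

x = torch.tensor([[1., 2.], [3., 0.], [5., 4.]])  # 3 nodes with 2-dim features
batch = torch.tensor([0, 0, 1])  # nodes 0-1 belong to graph 0, node 2 to graph 1
print(global_max_pool(x, batch))  # tensor([[3., 2.], [5., 4.]])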
@@ -242,7 +242,5 @@ def convert_benign(overhaul):
 
 
 if __name__ == '__main__':
-    # convert(35, 69)
-    # convert_benign(True)
     convert_benign(True)
     convert_malware(True)
@@ -1,264 +0,0 @@
-# coding=utf-8
-#
-# Reference Lister
-#
-# List all functions and all references to them in the current section.
-#
-# Implemented with the idautils module
-#
-import networkx as nx
-import pdb
-from graph_analysis_ida import *
-from graph_property import *
-
-
-# import wingdbstub
-# wingdbstub.Ensure()
-
-
-def get_funcs(ea):
-    funcs = {}
-    # Get current ea
-    # Loop from start to end in the current segment
-    for funcea in Functions(SegStart(ea)):
-        funcname = GetFunctionName(funcea)
-        func = get_func(funcea)
-        blocks = FlowChart(func)
-        funcs[funcname] = []
-        for bl in blocks:
-            start = bl.startEA
-            end = bl.endEA
-            funcs[funcname].append((start, end))
-    return funcs
-
-
-# Apparently unused function
-# def get_funcs_for_discoverRe(ea):
-#     features = {}
-#     for funcea in Functions(SegStart(ea)):
-#         funcname = GetFunctionName(funcea)
-#         print(funcname)
-#         func = get_func(funcea)
-#         feature = get_discoverRe_feature(func)
-#         features[funcname] = feature
-#     return features
-
-
-# Get the 11-dimensional attribute features of every basic block:
-# calls / transfer / arithmetic / logic / compare / move / interrupt /
-# data declaration / total instructions / string or integer constants / offspring count
-def get_bb_features(func):
-    bb_features = []
-    blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
-    for bl in blocks:
-        calls = calCalls(bl)
-        transferIns = calTransferIns(bl)
-        mathematicsIns = calArithmeticIns(bl)
-        logicIns = calLogicInstructions(bl)
-        cmpIns = calIns(bl, {'cmp': 1, 'cmps': 1, 'cmpsb': 1, 'cmppd': 1, 'cmpps': 1, 'fcom': 1, 'fcomp': 1, 'fcompp': 1, 'ficom': 1, 'ficomp': 1, 'ptest': 1, 'test': 1})
-        movIns = calIns(bl, {'mov': 1, 'movb': 1, 'movw': 1, 'movl': 1, 'movq': 1, 'movabsq': 1, 'push': 1, 'pop': 1, 'lea': 1})
-        interruptIns = calIns(bl, {'int1': 1, 'int3': 1, 'into': 1, 'iret': 1, 'iretd': 1, 'iretq': 1})
-        declareIns = calIns(bl, {'dw': 1, 'dd': 1, 'db': 1})
-        totalIns = calInsts(bl)
-        consts = getBBconsts(bl)
-        stringOrIntConsts = len(consts[0]) + len(consts[1])
-        bb_features.append([calls, transferIns, mathematicsIns, logicIns, cmpIns, movIns,
-                            interruptIns, declareIns, totalIns, stringOrIntConsts])
-    return bb_features
-
-
-def get_discoverRe_feature(func, icfg):
-    start = func.startEA
-    end = func.endEA
-    features = []
-    FunctionCalls = getFuncCalls(func)
-    # 1
-    features.append(FunctionCalls)
-    LogicInstr = getLogicInsts(func)
-    # 2
-    features.append(LogicInstr)
-    Transfer = getTransferInsts(func)
-    # 3
-    features.append(Transfer)
-    Locals = getLocalVariables(func)
-    # 4
-    features.append(Locals)
-    BB = getBasicBlocks(func)
-    # 5
-    features.append(BB)
-    Edges = len(icfg.edges())
-    # 6
-    features.append(Edges)
-    Incoming = getIncommingCalls(func)
-    # 7
-    features.append(Incoming)
-    # 8
-    Instrs = getIntrs(func)
-    features.append(Instrs)
-    between = retrieveGP(icfg)
-    # 9
-    features.append(between)
-
-    strings, consts = getfunc_consts(func)
-    # 10
-    features.append(strings)
-    # 11
-    features.append(consts)
-    return features
-
-
-def get_func_names(ea):
-    funcs = {}
-    for funcea in Functions(SegStart(ea)):
-        funcname = GetFunctionName(funcea)
-        funcs[funcname] = funcea
-    return funcs
-
-
-def get_func_bases(ea):
-    funcs = {}
-    for funcea in Functions(SegStart(ea)):
-        funcname = GetFunctionName(funcea)
-        funcs[funcea] = funcname
-    return funcs
-
-
-def get_func_range(ea):
-    funcs = {}
-    for funcea in Functions(SegStart(ea)):
-        funcname = GetFunctionName(funcea)
-        func = get_func(funcea)
-        funcs[funcname] = (func.startEA, func.endEA)
-    return funcs
-
-
-def get_func_sequences(ea):
-    funcs_bodylist = {}
-    funcs = get_funcs(ea)
-    for funcname in funcs:
-        if funcname not in funcs_bodylist:
-            funcs_bodylist[funcname] = []
-        for start, end in funcs[funcname]:
-            inst_addr = start
-            while inst_addr <= end:
-                opcode = GetMnem(inst_addr)
-                funcs_bodylist[funcname].append(opcode)
-                inst_addr = NextHead(inst_addr)
-    return funcs_bodylist
-
-
-def get_func_cfgs(ea):
-    func_cfglist = {}
-    i = 0
-    start, end = get_section('LOAD')
-    # print start, end
-    for funcea in Functions(SegStart(ea)):
-        if start <= funcea <= end:
-            funcname = GetFunctionName(funcea)
-            func = get_func(funcea)
-            print(i)
-            i += 1
-            try:
-                icfg = cfg.cfg_construct(func)
-                func_cfglist[funcname] = icfg
-            except:
-                pass
-
-    return func_cfglist
-
-
-def get_section(t):
-    base = SegByName(t)
-    start = SegByBase(base)
-    end = SegEnd(start)
-    return start, end
-
-
-def get_func_cfg_sequences(func_cfglist):
-    func_cfg_seqlist = {}
-    for funcname in func_cfglist:
-        func_cfg_seqlist[funcname] = {}
-        cfg = func_cfglist[funcname][0]
-        for start, end in cfg:
-            codesq = get_sequences(start, end)
-            func_cfg_seqlist[funcname][(start, end)] = codesq
-
-    return func_cfg_seqlist
-
-
-def get_sequences(start, end):
-    seq = []
-    inst_addr = start
-    while inst_addr <= end:
-        opcode = GetMnem(inst_addr)
-        seq.append(opcode)
-        inst_addr = NextHead(inst_addr)
-    return seq
-
-
-def get_stack_arg(func_addr):
-    print(func_addr)
-    args = []
-    stack = GetFrame(func_addr)
-    if not stack:
-        return []
-    firstM = GetFirstMember(stack)
-    lastM = GetLastMember(stack)
-    i = firstM
-    while i <= lastM:
-        mName = GetMemberName(stack, i)
-        mSize = GetMemberSize(stack, i)
-        if mSize:
-            i = i + mSize
-        else:
-            i = i + 4
-        if mName not in args and mName and ' s' not in mName and ' r' not in mName:
-            args.append(mName)
-    return args
-
-
-# pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))
-
-
-def processDataSegs():
-    funcdata = {}
-    datafunc = {}
-    for n in xrange(idaapi.get_segm_qty()):
-        seg = idaapi.getnseg(n)
-        ea = seg.startEA
-        segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE)
-        if segtype in [idc.SEG_DATA, idc.SEG_BSS]:
-            start = idc.SegStart(ea)
-            end = idc.SegEnd(ea)
-            cur = start
-            while cur <= end:
-                refs = [v for v in DataRefsTo(cur)]
-                for fea in refs:
-                    name = GetFunctionName(fea)
-                    if len(name) == 0:
-                        continue
-                    if name not in funcdata:
-                        funcdata[name] = [cur]
-                    else:
-                        funcdata[name].append(cur)
-                    if cur not in datafunc:
-                        datafunc[cur] = [name]
-                    else:
-                        datafunc[cur].append(name)
-                cur = NextHead(cur)
-    return funcdata, datafunc
-
-
-def obtainDataRefs(callgraph):
-    datarefs = {}
-    funcdata, datafunc = processDataSegs()
-    for node in callgraph:
-        if node in funcdata:
-            datas = funcdata[node]
-            for dd in datas:
-                refs = datafunc[dd]
-                refs = list(set(refs))
-                if node in datarefs:
-                    print(refs)
-                    datarefs[node] += refs
-                    datarefs[node] = list(set(datarefs[node]))
-                else:
-                    datarefs[node] = refs
-    return datarefs
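Note: get_funcs above enumerates functions with Functions(SegStart(ea)) and collects basic-block ranges via FlowChart. A hedged usage sketch, to be run inside IDAPython on the IDA 6.x API (here() returns the current cursor address):

funcs = get_funcs(here())
for name, blocks in funcs.items():
    print name, ["0x%x-0x%x" % (s, e) for s, e in blocks]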
@@ -16,9 +16,7 @@ from raw_graphs import *
 #from discovRe_feature.discovRe import *
 from discovRe import *
 
-sys.path.append("D:\\hkn\\project_folder\\Gencoding3\\Genius3\\python")
-
 #import wingdbstub
 #wingdbstub.Ensure()
 
 
@@ -119,7 +119,6 @@ def getIncommingCalls(func):
 
 
 def get_stackVariables(func_addr):
-    #print func_addr
     args = []
     stack = GetFrame(func_addr)
     if not stack:
@@ -1,7 +1,7 @@
 # coding=utf-8
 import os
 import pickle
 import idc
 from func import *
 from idc import *
 import idaapi
 
 # Constants
@@ -12,6 +12,7 @@ CFG_EXTENSION = ".ida"
 GDL_EXTENSION = ".dot"
 ASM_EXTENSION = ".asm"
 
+
 def preprocess(binary_name, workflow):
     cfg_path = os.path.join(
         INFECTED_DIR if workflow != "-1" else BENIGN_DIR,
@@ -29,9 +30,9 @@ def preprocess(binary_name, workflow):
     if os.path.exists(cfg_path):
         idc.Exit(0)
     else:
-        analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
-        analysis_flags &= ~idc.AF_IMMOFF
-        idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
+        analysis_flags = idc.GetShortPrm(idc.INF_AF2)
+        analysis_flags &= ~ida_ida.AF_IMMOFF
+        idc.SetShortPrm(idc.INF_AF2, analysis_flags)
 
         idaapi.autoWait()
 
@@ -47,17 +48,21 @@ def preprocess(binary_name, workflow):
     # Close IDA Pro
     idc.Exit(0)
 
 
 def generate_cfg(binary_name, cfg_path):
     cfgs = get_func_cfgs_c(FirstSeg())
     with open(cfg_path, 'wb') as cfg_file:
         pickle.dump(cfgs, cfg_file)
 
 
 def generate_gdl(gdl_path):
     idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT)
 
 
 def generate_asm(asm_path):
     idc.GenerateFile(idc.OFILE_ASM, asm_path, 0, idc.BADADDR, 0)
 
 
 # Main function
 def main():
     binary_name = idc.GetInputFile()
@@ -68,6 +73,7 @@ def main():
         return
     preprocess(binary_name, workflow)
 
+
 # Invoke the main function when run as an IDA Pro script
 if __name__ == "__main__":
     main()
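Note: preprocess() and main() above follow the standard headless-IDA script skeleton. A minimal sketch of that pattern (IDA 6.x IDAPython API; helper names are the ones defined in this file):

import idaapi
import idc

idaapi.autoWait()   # block until auto-analysis finishes
# ... extract artifacts, e.g. generate_cfg(...), generate_gdl(...), generate_asm(...)
idc.Exit(0)         # quit IDA so the surrounding batch loop can continue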
@@ -1,101 +0,0 @@
-# -*- coding: UTF-8 -*-
-import sys
-from matplotlib import pyplot as plt
-import networkx as nx
-import pickle
-# sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
-# sys.path.insert(1, 'C:/Python27/Lib/site-packages')
-
-
-def print_obj(obj):
-    # print all attributes of an object
-    print(obj.__dict__)
-
-
-# sub_10F20 308: the decompiled code contains strings, but this feature extraction yields no string
-# constants; they are probably referenced indirectly and not recognized. Checking the features of all
-# functions, almost none have string constants; they are likely stored elsewhere and referenced.
-# sub_166C4 393
-if __name__ == '__main__':
-    testpath = "D:\\hkn\\infected\\datasets\\virusshare_infected23_cfg\\VirusShare_9ba64176b2ca61212ff56a5b4eb546ff.ida"
-    fr = open(testpath, 'r')
-    data = pickle.load(fr)  # the ACFGs of one binary
-    fr.close()
-
-    # print(type(data1))
-    # print_obj(data1)
-    # print data1.raw_graph_list[393]
-    # print_obj(data1.raw_graph_list[393])
-    # nx.draw(data1.raw_graph_list[393].g, with_labels=True)
-    # plt.show()
-
-    print("Raw features of every function in one binary, as a list.")
-    print_obj(data)  # acfg list
-    print("\n")
-
-    print("Raw features of one function, made of three parts: old_g (the discovRe-style ACFG), g (the Genius-style ACFG), and fun_feature (the function-level feature vector)")
-    print_obj(data.raw_graph_list[0])  # the acfg of one function
-    print("fun_features = function-level features: # 1 function calls # 2 logic instructions # 3 TransferIns # 4 LocalVariables # 5 BB basicblocks # 6 Edges # 7 IncommingCalls # 8 Intrs # 9 between # 10 strings # 11 consts")
-    # feature = data.raw_graph_list[0].fun_features
-    print("old_g:{}".format(data.raw_graph_list[0].old_g))
-    print("g:{}".format(data.raw_graph_list[0].g))
-
-
-    # G = data1.raw_graph_list[393].old_g
-    # print G.node[0]  # G.node[i] is a dict
-    # for key, value in G.node[0].items():
-    #     print('{key}:{value}'.format(key=key, value=value))
-
-    # Basic-block features: #1 'consts' numeric constants #2 'strings' string constants #3 'offs' offspring count #4 'numAs' arithmetic instructions such as INC #5 'numCalls' call instructions #6 'numIns' instruction count #7 'numLIs' logic instructions such as AND #8 'numTIs' transfer instruction count
-    G = data.raw_graph_list[0].g
-    print("# Basic-block features: #1 'consts' numeric constants #2 'strings' string constants #3 'offs' offspring count #4 'numAs' arithmetic instructions such as INC #5 'numCalls' call instructions #6 'numIns' instruction count #7 'numLIs' logic instructions such as AND #8 'numTIs' transfer instruction count")
-    # print(G.node[0])
-    # print("\n")
-    # features of all basic blocks in this function
-    for key, value in G.node.items():
-        print('{}:{}'.format(key, value))
-
-
-
-    # old_g is read straight from IDA's CFG, so node counts, edge directions, etc. match; g is generated from old_g, so it matches as well.
-    # old g
-    G = data.raw_graph_list[0].old_g
-    nx.draw(G, with_labels=True)
-    #plt.title('old_g')
-    plt.show()
-
-
-    # g
-    G = data.raw_graph_list[0].g
-    nx.draw(G, with_labels=True)
-    #plt.title('Genius_g')
-    plt.show()
-
-    # draw graph with labels
-    pos = nx.spring_layout(G)
-    nx.draw(G, pos)
-    node_labels = nx.get_node_attributes(G, 'v')  # networkx nodes carry attributes; g uses 'v', the raw feature vector. See cfg_constructor.py for old_g's attributes.
-    nx.draw_networkx_labels(G, pos, labels=node_labels)
-    #plt.title('Genius_g with raw feature vector')
-    plt.show()
-
-
-    # 1 function calls: number of call instructions (call, jal, jalr) in this function. Note: ARM has none of these instructions.
-
-    # 2 logic instructions: number of logic instructions in this function, e.g. and, or
-
-    # 3 TransferIns: number of transfer instructions (e.g. jmp; mov on ARM)
-
-    # 4 LocalVariables: number of local variables
-
-    # 5 BB: number of basic blocks
-
-    # 6 Edges: number of icfg edges. The icfg is a feature from the discovRe paper; ignored here for now.
-
-    # 7 IncommingCalls: number of call sites that call this function
-
-    # 8 Intrs: number of instructions
-
-    # 9 between: betweenness, a structural feature
-
-    # 10 strings: string constants
-
-    # 11 consts: numeric constants
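Note: to keep the 11 fun_features indices above straight while reading dumps like this, a small helper can pair them with labels (a sketch; the names are assumed, not from the repo):

# Hypothetical labels for the 11 function-level features listed above
FEATURE_NAMES = ["function_calls", "logic_instrs", "transfer_instrs",
                 "local_vars", "basic_blocks", "edges", "incoming_calls",
                 "instrs", "betweenness", "strings", "consts"]

def describe(fun_features):
    # e.g. describe(data.raw_graph_list[0].fun_features)
    return dict(zip(FEATURE_NAMES, fun_features))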
@@ -1,356 +0,0 @@
-import cPickle as pickle
-from search import *
-from nearpy import Engine
-from nearpy.hashes import RandomDiscretizedProjections
-from nearpy.filters import NearestFilter, UniqueFilter
-from nearpy.distances import EuclideanDistance
-from nearpy.distances import CosineDistance
-from nearpy.hashes import RandomBinaryProjections
-from nearpy.experiments import DistanceRatioExperiment
-from redis import Redis
-from nearpy.storage import RedisStorage
-from feature import *
-import numpy as np
-import os
-import pdb
-import argparse
-import time
-import numpy as np
-from refactoring import *
-import pymongo
-from pymongo import MongoClient
-
-
-def initDB():
-    client = MongoClient()
-    client = MongoClient('localhost', 27017)
-    client = MongoClient('mongodb://localhost:27017/')
-    db = client.test_database
-    db = client['iot-encoding']
-    return db
-
-
-db = initDB()
-posts = db.posts
-
-
-class db:
-
-    def __init__(self):
-        self.feature_list = {}
-        self.engine = None
-
-    def loadHashmap(self, feature_size, result_n):
-        # Create redis storage adapter
-        redis_object = Redis(host='localhost', port=6379, db=0)
-        redis_storage = RedisStorage(redis_object)
-        pdb.set_trace()
-        try:
-            # Get hash config from redis
-            config = redis_storage.load_hash_configuration('test')
-            # Config is existing, create hash with None parameters
-            lshash = RandomBinaryProjections(None, None)
-            # Apply configuration loaded from redis
-            lshash.apply_config(config)
-
-        except:
-            # Config is not existing, create hash from scratch, with 10 projections
-            lshash = RandomBinaryProjections('test', 0)
-
-
-        # Create engine for feature space of 100 dimensions and use our hash.
-        # This will set the dimension of the lshash only the first time, not when
-        # using the configuration loaded from redis. Use redis storage to store
-        # buckets.
-        nearest = NearestFilter(1000)
-        #self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
-        pdb.set_trace()
-        self.engine = Engine(192, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance())
-
-        # Do some stuff like indexing or querying with the engine...
-
-        # Finally store hash configuration in redis for later use
-        redis_storage.store_hash_configuration(lshash)
-
-    def appendToDB(self, binary_name, funcname, fvector, firmware_name=""):
-        if fvector is None:
-            return
-        #ftuple = tuple([fvector])
-        self.engine.store_vector(np.asarray(fvector), ".".join((firmware_name, binary_name, funcname)))
-
-    def batch_appendDB(self, binary_name, features, firmware_name=""):
-        for funcname in features:
-            feature = features[funcname]
-            #pdb.set_trace()
-            self.appendToDB(binary_name, funcname, feature, firmware_name)
-
-    def batch_appendDBbyDir(self, base_dir):
-        cursor = posts.find({"firmware_name": "ddwrt-r21676_result"})
-        i = 0
-        for v in cursor:
-            print i
-            i += 1
-            binary_name = v['binary_name']
-            funcname = v['func_name']
-            firmware_name = v['firmware_name']
-            feature = v['fvector']
-            self.appendToDB(binary_name, funcname, feature, firmware_name)
-
-    def batch_appendDBbyDir1(self, base_dir):
-        image_dir = os.path.join(base_dir, "image")
-        firmware_featrues = {}
-        bnum = 0
-        fnum = 0
-        i = 0
-        pdb.set_trace()
-        for firmware_name in os.listdir(image_dir):
-            print firmware_name
-            firmware_featrues[firmware_name] = {}
-            firmware_dir = os.path.join(image_dir, firmware_name)
-            for binary_name in os.listdir(firmware_dir):
-                if binary_name.endswith(".features"):
-                    bnum += 1
-                    featrues_dir = os.path.join(firmware_dir, binary_name)
-                    featrues = pickle.load(open(featrues_dir, "r"))
-                    for funcname in featrues:
-                        fnum += 1
-                        #pdb.set_trace()
-                        feature = featrues[funcname]
-                        self.appendToDB(binary_name, funcname, feature, firmware_name)
-                    del featrues
-        print("bnum ", bnum)
-        print("fnum ", fnum)
-
-    def dump(self, base_dir):
-        db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
-        pickle.dump(self.feature_list, open(db_dir, 'w'))
-        db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
-        pickle.dump(self.engine, open(db_dir, 'w'))
-
-    def loadDB(self, base_dir):
-        db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
-        self.feature_list = pickle.load(open(db_dir, 'r'))
-        db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
-        self.engine = pickle.load(open(db_dir, 'r'))
-
-    def findF(self, binary_name, funcname):
-        x = [v for v in self.feature_list if binary_name in self.feature_list[v] and funcname in self.feature_list[v][binary_name]]
-        return x[0]
-
-
-def retrieveFeaturesByDir(n, base_dir):
-    firmware_featrues = {}
-    i = 0
-    for firmware_name in os.listdir(base_dir):
-        if firmware_name.endWith(".features"):
-            firmware_featrues[firmware_name] = {}
-            firmware_dir = os.path.join(base_dir, firmware_name)
-            if i > 0:
-                break
-            i += 1
-            pdb.set_trace()
-            for binary_name in os.listdir(firmware_dir):
-                featrues_dir = os.path.join(firmware_dir, binary_name + "_cb" + str(n) + ".features")
-                featrues = pickle.load(open(featrues_dir, "r"))
-                for funcname in featrues:
-                    feature = featrues[funcname]
-                    self.appendToDB(firmware_name, binary_name, funcname, feature)
-                del featrues
-
-
-def retrieveFeatures(n, base_dir, filename, funcs):
-    feature_dic = {}
-    featrues_dir = os.path.join(base_dir, "5000", filename + "_cb" + str(n) + ".features")
-    featrues = pickle.load(open(featrues_dir, "r"))
-    #featuresx = retrieveFeaturesx(filename)
-    for name in featrues:
-        #if name in funcs:
-        x = featrues[name]
-        #+ featuresx[name]
-        feature_dic[name] = np.asarray(x)
-    return feature_dic
-
-
-def retrieveVuldb(base_input_dir):
-    vul_path = os.path.join(base_input_dir, "vul")
-    vul_db = pickle.load(open(vul_path, "r"))
-    return vul_db
-
-
-def retrieveFeaturesx(filename):
-    ida_input_dir = os.path.join("./data/", filename + ".features")
-    featuresx = pickle.load(open(ida_input_dir, "r"))
-    return featuresx
-
-
-def retrieveQueries(n, base_dir, filename1, featrues_src):
-    queries = {}
-    featrues_dir = os.path.join(base_dir, "5000", filename1 + "_cb" + str(n) + ".features")
-    featrues = pickle.load(open(featrues_dir, "r"))
-    #featuresx = retrieveFeaturesx(filename1)
-    for name in featrues:
-        #if name in featrues_src:
-        x = featrues[name]
-        #+ featuresx[name]
-        queries[name] = np.asarray(x)
-    return queries
-
-
-def retrieveQueriesbyDir(n, base_dir, firmware_name, filename1):
-    queries = {}
-    featrues_dir = os.path.join(base_dir, firmware_name, filename1 + "_cb" + str(n) + ".features")
-    featrues = pickle.load(open(featrues_dir, "r"))
-    for name in featrues:
-        #del featrues[name][5]
-        queries[name] = np.asarray(featrues[name])
-    return queries
-
-
-def retrieveQuery(n, base_dir, filename, funcname):
-    featrues_dir = os.path.join(base_dir, filename + "_cb" + str(n) + ".features")
-    featrues = pickle.load(open(featrues_dir, "r"))
-    f = [featrues[v] for v in featrues if funcname in v][0]
-    return np.asarray(f)
-
-
-def parse_command():
-    parser = argparse.ArgumentParser(description='Process some integers.')
-    parser.add_argument("--base_input_dir", type=str, help="raw binaries to process for training")
-    parser.add_argument('--output_dir', type=str, help="output dir")
-    parser.add_argument("--filename1", type=str, help="the size of each graphlet")
-    parser.add_argument("--filename2", type=str, help="the size of each graphlet")
-    parser.add_argument("--size", type=int, help="the size of each graphlet")
-    #parser.add_argument("--size", type=int, help="the size of each graphlet")
-    args = parser.parse_args()
-    return args
-
-
-def loadFuncs(path):
-    funcs = {}
-    x86_dir = os.path.join(path, "func_candid")
-    #mips_dir = os.path.join(path, "openssl1.0.1a_mips.ida")
-    fp = open(x86_dir, "r")
-    for line in fp:
-        items = line.split("\n")
-        funcname = items[0]
-        funcs[funcname] = 1
-    return funcs
-
-
-def dump(path, featrues, queries):
-    fp = open(path + "/" + "matrix", 'w')
-    for name in featrues:
-        row = []
-        row.append("x86")
-        row.append(name)
-        row += featrues[name]
-        fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % tuple(row))
-    for name in queries:
-        row = []
-        row.append("mips")
-        row.append(name)
-        row += queries[name]
-        fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % tuple(row))
-    fp.close()
-
-
-def queryBytwo(base_input_dir, filename1, filename2, n):
-    threthold = 50
-    db_instance = db()
-    funcs = loadFuncs(base_input_dir)
-    db_instance.loadHashmap(n, 50000)
-    #pdb.set_trace()
-    featrues = retrieveFeatures(n, base_input_dir, filename1, funcs)
-    queries = retrieveQueries(n, base_input_dir, filename2, funcs)
-    #queries = refactoring(queries, featrues)
-    vul_db = retrieveVuldb(base_input_dir)
-    pdb.set_trace()
-    #dump(base_input_dir, featrues, queries)
-    #start = time.time()
-    #db_instance.batch_appendDBbyDir(base_input_dir)
-    #end = time.time()
-    #total = end - start
-    #print total
-    db_instance.batch_appendDB(filename1, featrues)
-    pdb.set_trace()
-    ranks = []
-    times = []
-    for threthold in xrange(1, 210, 10):
-        hit = []
-        i = 0
-        for name in queries:
-            #print i
-            i += 1
-            '''
-            if i == 1000:
-                print (sum(times)/len(times))
-                pdb.set_trace()
-                print "s"
-            '''
-            #if name not in vul_db['openssl']:
-            #    continue
-            if name not in featrues:
-                continue
-            #pdb.set_trace()
-            query = queries[name]
-            #start = time.time()
-            x = db_instance.engine.neighbours(query)
-            #end = time.time()
-            #total = end - start
-            #times.append(total)
-            #print total
-            #pdb.set_trace()
-            try:
-                rank = [v for v in xrange(len(x)) if name in x[v][1]][0]
-                ranks.append((name, rank))
-                if rank <= threthold:
-                    hit.append(1)
-                else:
-                    hit.append(0)
-            except:
-                #pdb.set_trace()
-                hit.append(0)
-                pass
-        #pdb.set_trace()
-        acc = sum(hit) * 1.0 / len(hit)
-        print acc
-
-
-def queryAll(base_dir, firmware_name, filename1, n):
-    threthold = 155
-    db_instance = db()
-    db_instance.loadHashmap(n, 50000)
-    queries = retrieveQueriesbyDir(n, base_dir, firmware_name, filename1)
-    start = time.time()
-    pdb.set_trace()
-    db_instance.batch_appendDBbyDir(n, base_dir)
-    end = time.time()
-    dur = end - start
-    print dur
-    pdb.set_trace()
-    hit = []
-    i = 0
-    times = []
-    for name in queries:
-        print i
-        i += 1
-        query = queries[name]
-        start = time.clock()
-        x = db_instance.engine.neighbours(query)
-        end = time.clock()
-        dur = end - start
-        times.append(dur)
-        #pdb.set_trace()
-        try:
-            rank = [v for v in xrange(len(x)) if name in x[v][1]]
-            if len(rank) > 1:
-                pdb.set_trace()
-                print "stop"
-            if rank[0] <= threthold:
-                hit.append(1)
-            else:
-                hit.append(0)
-        except:
-            hit.append(0)
-
-    acc = sum(hit) * 1.0 / len(hit)
-    mean = np.mean(times)
-    std = np.std(times)
-    #pdb.set_trace()
-    print acc
-
-
-if __name__ == "__main__":
-    args = parse_command()
-    base_dir = args.base_input_dir
-    filename1 = args.filename1
-    filename2 = args.filename2
-    n = args.size
-    pdb.set_trace()
-    queryBytwo(base_dir, filename1, filename2, n)
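Note: the removed search code couples nearpy to Redis storage and a MongoDB cursor; the core LSH store/query flow can be tried in-memory with nearpy alone. A minimal sketch (the stored function name is illustrative):

import numpy as np
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections

# 192-dim feature space with 10 random binary projections, default in-memory storage
engine = Engine(192, lshashes=[RandomBinaryProjections('rbp', 10)])
engine.store_vector(np.random.rand(192), 'busybox.some_func')
matches = engine.neighbours(np.random.rand(192))  # list of (vector, data, distance)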
@@ -1,16 +0,0 @@
-@echo off
-setlocal EnableDelayedExpansion
-
-
-set "FOLDER_PATH=D:\bishe\dataset\train_malware"
-
-
-
-for %%f in ("%FOLDER_PATH%\*") do (
-    echo !time! %%f
-    D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out %%f
-
-)
-
-endlocal
-