Compare commits

...

6 Commits

Author SHA1 Message Date
96971fd0a7 Merge branch 'master' into detached
# Conflicts:
#	Genius3/raw-feature-extractor/convert_pkl_to_json.py
#	Genius3/raw-feature-extractor/preprocessing_ida.py
2024-03-03 16:59:09 +08:00
1317c38bc6 Write logs 2024-03-03 16:39:19 +08:00
835b23f7be Add multithreading and logging 2024-03-03 14:34:47 +08:00
93db227535 Add multithreading and logging 2024-03-03 14:34:19 +08:00
65d25d42de Working py2.7 version 2024-03-01 16:42:02 +08:00
548eedb292 Batch operations 2024-03-01 16:11:26 +08:00
16 changed files with 189 additions and 920 deletions

View File

@@ -4,7 +4,7 @@
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/Genius3/python" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Python 2.7" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="malgraph" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">

View File

@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="malgraph" project-jdk-type="Python SDK" />
</project>

View File

@@ -2,7 +2,7 @@
setlocal EnableDelayedExpansion
set "FOLDER_PATH=D:\bishe\dataset\train_benign"
set "FOLDER_PATH=D:\bishe\dataset\train_benign_part0"

View File

@@ -0,0 +1,16 @@
@echo off
setlocal EnableDelayedExpansion
set "FOLDER_PATH=D:\bishe\dataset\sample_20230130_458"
for %%f in ("%FOLDER_PATH%\*") do (
echo !time! %%f
D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out %%f
)
endlocal

View File

@@ -1,16 +0,0 @@
# -*- coding: UTF-8 -*-
import sys
from func import *
from raw_graphs import *
from idc import *
import os
import argparse
if __name__ == '__main__':
print "hello"
#
# E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -A -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
# -c delete the old database  -A autonomous mode: analyze without showing dialogs
# -B is equivalent to -c -A
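For context, a hedged sketch of how a script passed via -S receives its arguments inside IDAPython: idc.ARGV[0] is the script path and user arguments follow. It assumes IDA 6.x IDAPython (Python 2); the flag behavior matches the notes above.

import idc

if __name__ == '__main__':
    # e.g. idaq.exe -c -A -S"preprocessing_ida.py 0" sample.exe
    workflow = idc.ARGV[1] if len(idc.ARGV) > 1 else None
    print "workflow = %s" % workflow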

View File

@@ -1,81 +0,0 @@
class HierarchicalGraphNeuralNetwork(nn.Module):
def __init__(self, external_vocab: Vocab):
super(HierarchicalGraphNeuralNetwork, self).__init__()
self.pool = 'global_max_pool'
# Hierarchical 1: Control Flow Graph (CFG) embedding and pooling
cfg_filter_list = [200, 200]
cfg_filter_list.insert(0, 11)
self.cfg_filter_length = len(cfg_filter_list)
cfg_graphsage_params = [dict(in_channels=cfg_filter_list[i], out_channels=cfg_filter_list[i + 1], bias=True) for
i in range(self.cfg_filter_length - 1)]
cfg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=cfg_graphsage_params)
cfg_constructor = cfg_conv['constructor']
for i in range(self.cfg_filter_length - 1):
setattr(self, 'CFG_gnn_{}'.format(i + 1), cfg_constructor(**cfg_conv['kwargs'][i]))
self.dropout = nn.Dropout(p=0.2)
# Hierarchical 2: Function Call Graph (FCG) embedding and pooling
self.external_embedding_layer = nn.Embedding(num_embeddings=external_vocab.max_vocab_size + 2,
embedding_dim=cfg_filter_list[-1],
padding_idx=external_vocab.pad_idx)
fcg_filter_list = [200, 200]
fcg_filter_list.insert(0, cfg_filter_list[-1])
self.fcg_filter_length = len(fcg_filter_list)
fcg_graphsage_params = [dict(in_channels=fcg_filter_list[i], out_channels=fcg_filter_list[i + 1], bias=True) for
i in range(self.fcg_filter_length - 1)]
fcg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=fcg_graphsage_params)
fcg_constructor = fcg_conv['constructor']
for i in range(self.fcg_filter_length - 1):
setattr(self, 'FCG_gnn_{}'.format(i + 1), fcg_constructor(**fcg_conv['kwargs'][i]))
# Last Projection Function: gradually project with more linear layers
self.pj1 = torch.nn.Linear(in_features=fcg_filter_list[-1], out_features=int(fcg_filter_list[-1] / 2))
self.pj2 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 2), out_features=int(fcg_filter_list[-1] / 4))
self.pj3 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 4), out_features=6)
self.last_activation = nn.Softmax(dim=1)
def forward(self, real_local_batch: Batch, real_bt_positions: list, bt_external_names: list,
bt_all_function_edges: list):
rtn_local_batch = self.forward_cfg_gnn(local_batch=real_local_batch)
x_cfg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_local_batch.x, batch=rtn_local_batch.batch)
fcg_list = []
fcg_internal_list = []
for idx_batch in range(len(real_bt_positions) - 1):
start_pos, end_pos = real_bt_positions[idx_batch: idx_batch + 2]
idx_x_cfg = x_cfg_pool[start_pos: end_pos]
fcg_internal_list.append(idx_x_cfg)
idx_x_external = self.external_embedding_layer(
torch.tensor([bt_external_names[idx_batch]], dtype=torch.long))
idx_x_external = idx_x_external.squeeze(dim=0)
idx_x_total = torch.cat([idx_x_cfg, idx_x_external], dim=0)
idx_function_edge = torch.tensor(bt_all_function_edges[idx_batch], dtype=torch.long)
idx_graph_data = Data(x=idx_x_total, edge_index=idx_function_edge)
idx_graph_data.validate()
fcg_list.append(idx_graph_data)
fcg_batch = Batch.from_data_list(fcg_list)
# Hierarchical 2: Function Call Graph (FCG) embedding and pooling
rtn_fcg_batch = self.forward_fcg_gnn(function_batch=fcg_batch) # [batch_size, max_node_size, dim]
x_fcg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_fcg_batch.x, batch=rtn_fcg_batch.batch)
batch_final = x_fcg_pool
# step last project to the number_of_classes (multiclass)
bt_final_embed = self.pj3(self.pj2(self.pj1(batch_final)))
bt_pred = self.last_activation(bt_final_embed)
return bt_pred
def forward_cfg_gnn(self, local_batch: Batch):
in_x, edge_index = local_batch.x, local_batch.edge_index
for i in range(self.cfg_filter_length - 1):
out_x = getattr(self, 'CFG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
out_x = torch.nn.functional.relu(out_x, inplace=True)
out_x = self.dropout(out_x)
in_x = out_x
local_batch.x = in_x
return local_batch
def forward_fcg_gnn(self, function_batch: Batch):
in_x, edge_index = function_batch.x, function_batch.edge_index
for i in range(self.fcg_filter_length - 1):
out_x = getattr(self, 'FCG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
out_x = torch.nn.functional.relu(out_x, inplace=True)
out_x = self.dropout(out_x)
in_x = out_x
function_batch.x = in_x
return function_batch
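As a sanity check, a hedged usage sketch for the deleted model above (not from the repo): ToyVocab is a hypothetical stand-in for the Vocab type, and the sketch assumes a PyG release (around 2.3) where torch_geometric.nn.glob and Data.validate() both exist, plus the torch / torch_geometric / Data / Batch imports the deleted file itself relied on.

import torch
from torch_geometric.data import Batch, Data

class ToyVocab(object):
    # hypothetical stand-in: only the two attributes the model reads
    max_vocab_size = 10
    pad_idx = 0

model = HierarchicalGraphNeuralNetwork(external_vocab=ToyVocab())

# Two internal functions, each a tiny CFG with 11-dim basic-block features.
cfg1 = Data(x=torch.rand(3, 11), edge_index=torch.tensor([[0, 1], [1, 2]]))
cfg2 = Data(x=torch.rand(2, 11), edge_index=torch.tensor([[0], [1]]))
local_batch = Batch.from_data_list([cfg1, cfg2])

positions = [0, 2]              # one sample owning both pooled CFG vectors
external_ids = [[3]]            # one external-function id per sample
fcg_edges = [[[0, 1], [1, 2]]]  # per-sample FCG edge_index (2 x E)

pred = model(local_batch, positions, external_ids, fcg_edges)
print(pred.shape)  # expected: torch.Size([1, 6]) class probabilities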

View File

@@ -1,264 +0,0 @@
# coding=utf-8
#
# Reference Lister
#
# List all functions and all references to them in the current section.
#
# Implemented with the idautils module
#
import networkx as nx
import pdb
from graph_analysis_ida import *
from graph_property import *
# import wingdbstub
# wingdbstub.Ensure()
def get_funcs(ea):
funcs = {}
# Get current ea
# Loop from start to end in the current segment
for funcea in Functions(SegStart(ea)):
funcname = GetFunctionName(funcea)
func = get_func(funcea)
blocks = FlowChart(func)
funcs[funcname] = []
for bl in blocks:
start = bl.startEA
end = bl.endEA
funcs[funcname].append((start, end))
return funcs
# Apparently an unused function
# def get_funcs_for_discoverRe(ea):
# features = {}
# for funcea in Functions(SegStart(ea)):
# funcname = GetFunctionName(funcea)
# print(funcname)
# func = get_func(funcea)
# feature = get_discoverRe_feature(func)
# features[funcname] = feature
# return features
# Get the 11-dimensional attribute features of every basic block:
# calls / transfer / arithmetic / logic / compare / move / interrupt / data declaration / total instructions / string-or-integer constants / number of offspring
def get_bb_features(func):
bb_features = []
blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
for bl in blocks:
calls = calCalls(bl)
transferIns = calTransferIns(bl)
mathematicsIns = calArithmeticIns(bl)
logicIns = calLogicInstructions(bl)
cmpIns = calIns(bl, {'cmp': 1, 'cmps': 1, 'cmpsb': 1, 'cmppd': 1, 'cmpps': 1, 'fcom': 1, 'fcomp': 1, 'fcompp': 1, 'ficom': 1, 'ficomp': 1, 'ptest': 1, 'test': 1})
movIns = calIns(bl, {'mov': 1, 'movb': 1, 'movw': 1, 'movl': 1, 'movq': 1, 'movabsq': 1, 'push': 1, 'pop': 1, 'lea': 1})
interruptIns = calIns(bl, {'int1': 1, 'int3': 1, 'into': 1, 'iret': 1, 'iretd': 1, 'iretq': 1})
declareIns = calIns(bl, {'dw': 1, 'dd': 1, 'db': 1})
totalIns = calInsts(bl)
consts = getBBconsts(bl)
stringOrIntConsts = len(consts[0]) + len(consts[1])
bb_features.append([calls, transferIns, mathematicsIns, logicIns, cmpIns, movIns,
interruptIns, declareIns, totalIns, stringOrIntConsts])
return bb_features
def get_discoverRe_feature(func, icfg):
start = func.startEA
end = func.endEA
features = []
FunctionCalls = getFuncCalls(func)
# 1
features.append(FunctionCalls)
LogicInstr = getLogicInsts(func)
# 2
features.append(LogicInstr)
Transfer = getTransferInsts(func)
# 3
features.append(Transfer)
Locals = getLocalVariables(func)
# 4
features.append(Locals)
BB = getBasicBlocks(func)
# 5
features.append(BB)
Edges = len(icfg.edges())
# 6
features.append(Edges)
Incoming = getIncommingCalls(func)
# 7
features.append(Incoming)
# 8
Instrs = getIntrs(func)
features.append(Instrs)
between = retrieveGP(icfg)
# 9
features.append(between)
strings, consts = getfunc_consts(func)
# 10
features.append(strings)
# 11
features.append(consts)
return features
def get_func_names(ea):
funcs = {}
for funcea in Functions(SegStart(ea)):
funcname = GetFunctionName(funcea)
funcs[funcname] = funcea
return funcs
def get_func_bases(ea):
funcs = {}
for funcea in Functions(SegStart(ea)):
funcname = GetFunctionName(funcea)
funcs[funcea] = funcname
return funcs
def get_func_range(ea):
funcs = {}
for funcea in Functions(SegStart(ea)):
funcname = GetFunctionName(funcea)
func = get_func(funcea)
funcs[funcname] = (func.startEA, func.endEA)
return funcs
def get_func_sequences(ea):
funcs_bodylist = {}
funcs = get_funcs(ea)
for funcname in funcs:
if funcname not in funcs_bodylist:
funcs_bodylist[funcname] = []
for start, end in funcs[funcname]:
inst_addr = start
while inst_addr <= end:
opcode = GetMnem(inst_addr)
funcs_bodylist[funcname].append(opcode)
inst_addr = NextHead(inst_addr)
return funcs_bodylist
def get_func_cfgs(ea):
func_cfglist = {}
i = 0
start, end = get_section('LOAD')
# print start, end
for funcea in Functions(SegStart(ea)):
if start <= funcea <= end:
funcname = GetFunctionName(funcea)
func = get_func(funcea)
print(i)
i += 1
try:
icfg = cfg.cfg_construct(func)
func_cfglist[funcname] = icfg
except:
pass
return func_cfglist
def get_section(t):
base = SegByName(t)
start = SegByBase(base)
end = SegEnd(start)
return start, end
def get_func_cfg_sequences(func_cfglist):
func_cfg_seqlist = {}
for funcname in func_cfglist:
func_cfg_seqlist[funcname] = {}
cfg = func_cfglist[funcname][0]
for start, end in cfg:
codesq = get_sequences(start, end)
func_cfg_seqlist[funcname][(start, end)] = codesq
return func_cfg_seqlist
def get_sequences(start, end):
seq = []
inst_addr = start
while inst_addr <= end:
opcode = GetMnem(inst_addr)
seq.append(opcode)
inst_addr = NextHead(inst_addr)
return seq
def get_stack_arg(func_addr):
print(func_addr)
args = []
stack = GetFrame(func_addr)
if not stack:
return []
firstM = GetFirstMember(stack)
lastM = GetLastMember(stack)
i = firstM
while i <= lastM:
mName = GetMemberName(stack, i)
mSize = GetMemberSize(stack, i)
if mSize:
i = i + mSize
else:
i = i + 4
if mName not in args and mName and ' s' not in mName and ' r' not in mName:
args.append(mName)
return args
# pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))
def processDataSegs():
funcdata = {}
datafunc = {}
for n in xrange(idaapi.get_segm_qty()):
seg = idaapi.getnseg(n)
ea = seg.startEA
segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE)
if segtype in [idc.SEG_DATA, idc.SEG_BSS]:
start = idc.SegStart(ea)
end = idc.SegEnd(ea)
cur = start
while cur <= end:
refs = [v for v in DataRefsTo(cur)]
for fea in refs:
name = GetFunctionName(fea)
if len(name) == 0:
continue
if name not in funcdata:
funcdata[name] = [cur]
else:
funcdata[name].append(cur)
if cur not in datafunc:
datafunc[cur] = [name]
else:
datafunc[cur].append(name)
cur = NextHead(cur)
return funcdata, datafunc
def obtainDataRefs(callgraph):
datarefs = {}
funcdata, datafunc = processDataSegs()
for node in callgraph:
if node in funcdata:
datas = funcdata[node]
for dd in datas:
refs = datafunc[dd]
refs = list(set(refs))
if node in datarefs:
print(refs)
datarefs[node] += refs
datarefs[node] = list(set(datarefs[node]))
else:
datarefs[node] = refs
return datarefs
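For reference, a hedged summary of the per-basic-block feature order produced by get_bb_features above. Note the function body appends 10 entries; the 11th from its comment (offspring, 'offs') appears to be attached elsewhere. The constant name is illustrative, not from the repo.

BB_FEATURE_ORDER = [
    'calls', 'transfer', 'arithmetic', 'logic', 'compare',
    'move', 'interrupt', 'data_declaration', 'total_instructions',
    'string_or_int_constants',
]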

View File

@@ -16,9 +16,7 @@ from raw_graphs import *
#from discovRe_feature.discovRe import *
from discovRe import *
sys.path.append("D:\\hkn\\project_folder\\Gencoding3\\Genius3\\python")
#import wingdbstub
#wingdbstub.Ensure()

View File

@@ -119,24 +119,23 @@ def getIncommingCalls(func):
def get_stackVariables(func_addr):
#print func_addr
args = []
stack = GetFrame(func_addr)
if not stack:
return 0
firstM = GetFirstMember(stack)
lastM = GetLastMember(stack)
i = firstM
while i <=lastM:
mName = GetMemberName(stack,i)
mSize = GetMemberSize(stack,i)
if mSize:
i = i + mSize
else:
i = i+4
if mName not in args and mName and 'var_' in mName:
args.append(mName)
return len(args)
args = []
stack = GetFrame(func_addr)
if not stack:
return 0
firstM = GetFirstMember(stack)
lastM = GetLastMember(stack)
i = firstM
while i <= lastM:
mName = GetMemberName(stack, i)
mSize = GetMemberSize(stack, i)
if mSize:
i = i + mSize
else:
i = i + 4
if mName not in args and mName and 'var_' in mName:
args.append(mName)
return len(args)
# Count arithmetic instructions

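For context, a hedged usage sketch of the cleaned-up helper above; ScreenEA and calling GetFrame with any address inside the function are IDA 6.x idc assumptions.

ea = ScreenEA()
print get_stackVariables(ea)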
View File

@@ -0,0 +1,26 @@
import logging
import os
def setup_logger(name, log_file, level=logging.INFO):
"""Function setup as many loggers as you want"""
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
handler = logging.FileHandler(log_file, mode='w')  # truncate the log at the start of each run
handler.setFormatter(formatter)
# Also add a stream handler for console output
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger = logging.getLogger(name)
logger.setLevel(level)
logger.addHandler(handler)
logger.addHandler(stream_handler)
return logger
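A short usage sketch, mirroring the call this diff's multithreading script makes further down:

from log_utils import setup_logger

log = setup_logger('thread_out', 'ida_asm_create_out.log')
log.info('batch run started')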

View File

@@ -1,73 +1,54 @@
# coding=utf-8
import os
# -*- coding: UTF-8 -*-
import pickle
import idc
import idaapi
from func import *
from idc import *
import os
# Define constants
DATA_DIR = "D:\\bishe\\dataset"
INFECTED_DIR = os.path.join(DATA_DIR, "infected")
BENIGN_DIR = os.path.join(DATA_DIR, "benign")
CFG_EXTENSION = ".ida"
GDL_EXTENSION = ".dot"
ASM_EXTENSION = ".asm"
def preprocess(binary_name, workflow):
cfg_path = os.path.join(
INFECTED_DIR if workflow != "-1" else BENIGN_DIR,
f"{binary_name}{CFG_EXTENSION}"
)
gdl_path = os.path.join(
INFECTED_DIR if workflow != "-1" else BENIGN_DIR,
f"{binary_name}{GDL_EXTENSION}"
)
asm_path = os.path.join(
INFECTED_DIR if workflow != "-1" else BENIGN_DIR,
f"{binary_name}{ASM_EXTENSION}"
)
def preprocess():
# E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
# print str(sys.argv) #['raw-feature-extractor/preprocessing_ida.py']
# print str(idc.ARGV) #['raw-feature-extractor/preprocessing_ida.py', '--path', 'C:\\Program1\\pycharmproject\\Genius3\\acfgs']
# print idc.ARGV[2]
# print type(idc.ARGV[2])
if os.path.exists(cfg_path):
idc.Exit(0)
binary_name = idc.GetInputFile()
workflow = idc.ARGV[1]
# When workflow is a specific value (-1), analyze benign software; otherwise analyze malware
if workflow == '-1':
cfg_path = "D:\\bishe\\dataset\\benign\\refind_cfg\\{}.ida".format(binary_name)
gdl_path = "D:\\bishe\\dataset\\benign\\refind_dot\\{}.dot".format(binary_name)
asm_path = "D:\\bishe\\dataset\\benign\\refind_asm\\{}.asm".format(binary_name)
else:
analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
analysis_flags &= ~idc.AF_IMMOFF
idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
cfg_path = "D:\\bishe\\dataset\\infected\\infected_cfg\\{}.ida".format(binary_name)
gdl_path = "D:\\bishe\\dataset\\infected\\infected_dot\\{}.dot".format(binary_name)
asm_path = "D:\\bishe\\dataset\\infected\\infected_asm\\{}.asm".format(binary_name)
idaapi.autoWait()
analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
analysis_flags &= ~idc.AF_IMMOFF
idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
idaapi.autoWait()
# Generate the CFG
generate_cfg(binary_name, cfg_path)
# Generate the list of CFGs for the PE file
# cfgs = get_func_cfgs_c(FirstSeg())
# Save the CFGs as .ida
# pickle.dump(cfgs, open(cfg_path, 'w'))
# Generate the GDL
generate_gdl(gdl_path)
# Generate the PE file's FCG and save it as a .dot file
# idc.GenCallGdl(gdl_path, 'Call Gdl', idc.CHART_GEN_GDL)  # emits a .gdl file, a format that is nearly impossible to find documented online
# idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT)
# Generate the ASM
generate_asm(asm_path)
# Close IDA Pro
idc.Exit(0)
def generate_cfg(binary_name, cfg_path):
cfgs = get_func_cfgs_c(FirstSeg())
with open(cfg_path, 'wb') as cfg_file:
pickle.dump(cfgs, cfg_file)
def generate_gdl(gdl_path):
idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT)
def generate_asm(asm_path):
# Generate the .asm file
idc.GenerateFile(idc.OFILE_ASM, asm_path, 0, idc.BADADDR, 0)
# Main entry point
def main():
binary_name = idc.GetInputFile()
try:
workflow = idc.ARGV[1]
except IndexError:
print("Workflow argument not provided.")
return
preprocess(binary_name, workflow)
# Close IDA Pro
idc.Exit(0)
# When run as an IDA Pro script, invoke main
if __name__ == "__main__":
main()
# General command-line format: idaq64 -c -A -S"preprocessing_ida.py arg1 arg2" VirusShare_bca58b12923073
# Here: idaq64 -c -A -S"preprocessing_ida.py workflow" -oF:\iout pe_path; the full command lines are shown below
# F:\kkk\IDA_6.6\idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oF:\iout D:\hkn\infected\datasets\virusshare_infected0\VirusShare_bc161e5e792028e8137aa070fda53f82
# D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out D:\bishe\dataset\train_malware\0ACDbR5M3ZhBJajygTuf
if __name__ == '__main__':
preprocess()

View File

@@ -1,101 +0,0 @@
# -*- coding: UTF-8 -*-
import sys
from matplotlib import pyplot as plt
import networkx as nx
import pickle
# sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
# sys.path.insert(1, 'C:/Python27/Lib/site-packages')
def print_obj(obj):
# "打印对象的所有属性"
print(obj.__dict__)
# sub_10F20 308: the decompiled code contains strings, but no strings appear in this feature extraction; indirectly referenced constants may go unrecognized. Checking the features of all functions, almost none have string constants; they are probably stored elsewhere and referenced.
# sub_166C4 393
if __name__ == '__main__':
testpath = "D:\\hkn\\infected\\datasets\\virusshare_infected23_cfg\\VirusShare_9ba64176b2ca61212ff56a5b4eb546ff.ida"
fr = open(testpath, 'r')
data = pickle.load(fr)  # the ACFGs of one binary
fr.close()
# print(type(data1))
# print_obj(data1)
# print data1.raw_graph_list[393]
# print_obj(data1.raw_graph_list[393])
# nx.draw(data1.raw_graph_list[393].g,with_labels=True)
# plt.show()
print("一个二进制文件的所有函数的原始特征list。")
print_obj(data) # acfg list
print("\n")
print("一个函数的原始特征由old_gdiscovRe方法的ACFGgGenius方法的ACFGfun_feature表示函数级别的特征的向量三部分构成")
print_obj(data.raw_graph_list[0]) # 一个函数的acfg
print("其中fun_features = 函数级别特征: # 1 function calls # 2 logic instructions # 3 TransferIns # 4 LocalVariables # 5 BB basicblocks# 6 Edges # 7 IncommingCalls# 8 Intrs# 9 between # 10 strings # 11 consts")
# feature = data.raw_graph_list[0].fun_features
print("old_g:{}".format(data.raw_graph_list[0].old_g))
print("g:{}".format(data.raw_graph_list[0].g))
# G = data1.raw_graph_list[393].old_g
# print G.node[0]  # G.node[i] is a dict
# for key, value in G.node[0].items():
# print('{key}:{value}'.format(key=key, value=value))
# Basic-block features: #1 'consts' numeric constants #2 'strings' string constants #3 'offs' offspring count #4 'numAs' arithmetic instructions such as INC #5 'numCalls' call instructions #6 'numIns' instruction count #7 'numLIs' logic instructions such as AND #8 'numTIs' transfer instruction count
G = data.raw_graph_list[0].g
print("# 基本块的特征 #1'consts' 数字常量 #2'strings'字符串常量 #3'offs' offspring 后代数量 #4'numAs' 算数指令如INC #5'numCalls' 调用指令 #6'numIns' 指令数量 #7'numLIs' LogicInstructions 逻辑如AND #8'numTIs' 转移指令数量")
# print(G.node[0])
# print("\n")
# Features of every basic block in the function
for key, value in G.node.items():
print('{}:{}'.format(key, value))
# old_g is the CFG as read from IDA, so node counts, edge directions, etc. match; g is generated from old_g, so it matches as well
#old g
G = data.raw_graph_list[0].old_g
nx.draw(G, with_labels=True)
#plt.title('old_g')
plt.show()
# g
G = data.raw_graph_list[0].g
nx.draw(G, with_labels=True)
#plt.title('Genius_g')
plt.show()
# draw graph with labels
pos = nx.spring_layout(G)
nx.draw(G, pos)
node_labels = nx.get_node_attributes(G, 'v')  # networkx nodes carry attributes; g's attribute is 'v', the vector of raw features; see cfg_constructor.py for old_g's attributes
nx.draw_networkx_labels(G, pos, labels=node_labels)
#plt.title('Genius_g with raw feature vector')
plt.show()
# 1 function calls: number of call instructions in this function (call, jal, jalr); note ARM has none of these
# 2 logic instructions: number of logic instructions, e.g. and, or
# 3 TransferIns: number of transfer instructions (jmp; mov on ARM)
# 4 LocalVariables: number of local variables
# 5 BB: number of basic blocks
# 6 Edges: number of ICFG edges; the ICFG comes from another paper (discovRe) and is ignored here for now
# 7 IncommingCalls: number of call sites that invoke this function
# 8 Intrs: number of instructions
# 9 between: betweenness from the structural features
# 10 strings: string constants
# 11 consts: numeric constants
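A hedged sketch of consuming that vector in the order just listed (the local names are illustrative, not from the repo):

feat = data.raw_graph_list[0].fun_features
(n_calls, n_logic, n_transfer, n_locals, n_bbs, n_edges,
 n_incoming, n_instrs, betweenness, n_strings, n_consts) = feat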

View File

@@ -0,0 +1,83 @@
# coding=utf-8
import os
import subprocess
import threading
import time
from log_utils import setup_logger
# Maximum number of concurrent threads
max_threads = 20
# Lock and condition variable for thread synchronization
thread_lock = threading.Lock()
active_threads = 0
threads_completed = 0
condition = threading.Condition(thread_lock)
timer_event = threading.Event()
def execute_command(cmd, log):
"""
在子线程中执行给定的命令
"""
global active_threads, threads_completed
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
stdout, stderr = process.communicate()
with condition:
if stdout:
log.info("stdout: %s" % stdout.decode('gbk'))
if stderr:
log.warning("stderr: %s\n err_cmd: %s" % (stderr.decode('gbk'), cmd))
# Release this thread's slot once its work is done
active_threads -= 1
threads_completed += 1
condition.notify_all()
print threads_completed
def timer_thread():
print 'start timer thread'
while True:
with condition:
# Check once per second and print how many commands have completed
while not timer_event.is_set() and threads_completed < len(commands):
timer_event.wait(1)
print "done file: %d" % threads_completed
# Stop the timer thread once every command has completed
if threads_completed == len(commands):
timer_event.set()
break
if __name__ == '__main__':
# timer = threading.Thread(target=timer_thread)
# timer.start()
# timer_event.clear()
log = setup_logger('thread_out', 'ida_asm_create_out.log')
# Sample directory
sample_dir = "D:/bishe/dataset/sample_20230130_458"
# Build the command list, then create and start the threads
commands = []
for file in os.listdir(sample_dir):
com = r'D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out '
commands.append(com + "D:/bishe/dataset/sample_20230130_458/" + file)
threads = []
for cmd in commands:
    with condition:
        # Claim a thread slot under the lock; wait while all slots are busy
        while active_threads >= max_threads:
            condition.wait()
        active_threads += 1
    thread = threading.Thread(target=execute_command, args=(cmd, log))
    thread.start()
    threads.append(thread)
# Wait for all threads to finish
for thread in threads:
thread.join()
print("所有命令已执行完毕.")

View File

@@ -1,356 +0,0 @@
import cPickle as pickle
from search import *
from nearpy import Engine
from nearpy.hashes import RandomDiscretizedProjections
from nearpy.filters import NearestFilter, UniqueFilter
from nearpy.distances import EuclideanDistance
from nearpy.distances import CosineDistance
from nearpy.hashes import RandomBinaryProjections
from nearpy.experiments import DistanceRatioExperiment
from redis import Redis
from nearpy.storage import RedisStorage
from feature import *
import numpy as np
import os
import pdb
import argparse
import time
import numpy as np
from refactoring import *
import pymongo
from pymongo import MongoClient
def initDB():
client = MongoClient('mongodb://localhost:27017/')
db = client['iot-encoding']
return db
db = initDB()
posts = db.posts
class db:
def __init__(self):
self.feature_list = {}
self.engine = None
def loadHashmap(self, feature_size, result_n):
# Create redis storage adapter
redis_object = Redis(host='localhost', port=6379, db=0)
redis_storage = RedisStorage(redis_object)
pdb.set_trace()
try:
# Get hash config from redis
config = redis_storage.load_hash_configuration('test')
# Config is existing, create hash with None parameters
lshash = RandomBinaryProjections(None, None)
# Apply configuration loaded from redis
lshash.apply_config(config)
except:
# Config is not existing, create hash from scratch, with 10 projections
lshash = RandomBinaryProjections('test', 0)
# Create engine for feature space of 100 dimensions and use our hash.
# This will set the dimension of the lshash only the first time, not when
# using the configuration loaded from redis. Use redis storage to store
# buckets.
nearest = NearestFilter(1000)
#self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
pdb.set_trace()
self.engine = Engine(192, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance())
# Do some stuff like indexing or querying with the engine...
# Finally store hash configuration in redis for later use
redis_storage.store_hash_configuration(lshash)
def appendToDB(self, binary_name, funcname, fvector, firmware_name=""):
if fvector is None:
return
#ftuple = tuple([fvector])
self.engine.store_vector(np.asarray(fvector), ".".join((firmware_name,binary_name,funcname)))
def batch_appendDB(self, binary_name, features, firmware_name=""):
for funcname in features:
feature = features[funcname]
#pdb.set_trace()
self.appendToDB(binary_name, funcname, feature, firmware_name)
def batch_appendDBbyDir(self, base_dir):
cursor = posts.find({"firmware_name":"ddwrt-r21676_result"})
i = 0
for v in cursor:
print i
i+=1
binary_name = v['binary_name']
funcname = v['func_name']
firmware_name = v['firmware_name']
feature = v['fvector']
self.appendToDB(binary_name, funcname, feature, firmware_name)
def batch_appendDBbyDir1(self, base_dir):
image_dir = os.path.join(base_dir, "image")
firmware_featrues={}
bnum = 0
fnum = 0
i = 0
pdb.set_trace()
for firmware_name in os.listdir(image_dir):
print firmware_name
firmware_featrues[firmware_name] = {}
firmware_dir = os.path.join(image_dir, firmware_name)
for binary_name in os.listdir(firmware_dir):
if binary_name.endswith(".features"):
bnum += 1
featrues_dir = os.path.join(firmware_dir, binary_name)
featrues = pickle.load(open(featrues_dir, "r"))
for funcname in featrues:
fnum +=1
#pdb.set_trace()
feature = featrues[funcname]
self.appendToDB(binary_name, funcname, feature, firmware_name)
del featrues
print("bnum ", bnum)
print("fnum ", fnum)
def dump(self, base_dir):
db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
pickle.dump(self.feature_list, open(db_dir, 'w'))
db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
pickle.dump(self.engine, open(db_dir, 'w'))
def loadDB(self, base_dir):
db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
self.feature_list = pickle.load(open(db_dir, 'r'))
db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
self.engine = pickle.load(open(db_dir, 'r'))
def findF(self, binary_name, funcname):
x = [v for v in self.feature_list if binary_name in self.feature_list[v] and funcname in self.feature_list[v][binary_name]]
return x[0]
def retrieveFeaturesByDir(n, base_dir):
firmware_featrues={}
i = 0
for firmware_name in os.listdir(base_dir):
if firmware_name.endswith(".features"):
firmware_featrues[firmware_name] = {}
firmware_dir = os.path.join(base_dir, firmware_name)
if i > 0:
break
i += 1
pdb.set_trace()
for binary_name in os.listdir(firmware_dir):
featrues_dir = os.path.join(firmware_dir, binary_name + "_cb" + str(n) + ".features")
featrues = pickle.load(open(featrues_dir, "r"))
for funcname in featrues:
feature = featrues[funcname]
self.appendToDB(firmware_name, binary_name, funcname, feature)
del featrues
def retrieveFeatures(n, base_dir, filename, funcs):
feature_dic = {}
featrues_dir = os.path.join(base_dir, "5000", filename + "_cb" + str(n) + ".features")
featrues = pickle.load(open(featrues_dir, "r"))
#featuresx = retrieveFeaturesx(filename)
for name in featrues:
#if name in funcs:
x = featrues[name]
#+ featuresx[name]
feature_dic[name] = np.asarray(x)
return feature_dic
def retrieveVuldb(base_input_dir):
vul_path = os.path.join(base_input_dir, "vul")
vul_db = pickle.load(open(vul_path, "r"))
return vul_db
def retrieveFeaturesx(filename):
ida_input_dir = os.path.join("./data/", filename + ".features")
featuresx = pickle.load(open(ida_input_dir, "r"))
return featuresx
def retrieveQueries(n, base_dir, filename1, featrues_src):
queries = {}
featrues_dir = os.path.join(base_dir, "5000", filename1 + "_cb" + str(n) + ".features")
featrues = pickle.load(open(featrues_dir, "r"))
#featuresx = retrieveFeaturesx(filename1)
for name in featrues:
#if name in featrues_src:
x = featrues[name]
#+ featuresx[name]
queries[name] = np.asarray(x)
return queries
def retrieveQueriesbyDir(n, base_dir, firmware_name, filename1):
queries = {}
featrues_dir = os.path.join(base_dir, firmware_name, filename1 + "_cb" + str(n) + ".features")
featrues = pickle.load(open(featrues_dir, "r"))
for name in featrues:
#del featrues[name][5]
queries[name] = np.asarray(featrues[name])
return queries
def retrieveQuery(n, base_dir, filename, funcname):
featrues_dir = os.path.join(base_dir, filename + "_cb" + str(n) + ".features")
featrues = pickle.load(open(featrues_dir, "r"))
f = [featrues[v] for v in featrues if funcname in v ][0]
return np.asarray(f)
def parse_command():
parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument("--base_input_dir", type=str, help="raw binaries to process for training")
parser.add_argument('--output_dir', type=str, help="output dir")
parser.add_argument("--filename1", type=str, help="the size of each graphlet")
parser.add_argument("--filename2", type=str, help="the size of each graphlet")
parser.add_argument("--size", type=int, help="the size of each graphlet")
#parser.add_argument("--size", type=int, help="the size of each graphlet")
args = parser.parse_args()
return args
def loadFuncs(path):
funcs = {}
x86_dir = os.path.join(path, "func_candid")
#mips_dir = os.path.join(path, "openssl1.0.1a_mips.ida")
fp = open(x86_dir,"r")
for line in fp:
items = line.split("\n")
funcname = items[0]
funcs[funcname] = 1
return funcs
def dump(path, featrues, queries):
fp = open(path + "/" + "matrix", 'w')
for name in featrues:
row = []
row.append("x86")
row.append(name)
row += featrues[name]
fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" %tuple(row))
for name in queries:
row = []
row.append("mips")
row.append(name)
row += queries[name]
fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % tuple(row))
fp.close()
def queryBytwo(base_input_dir, filename1, filename2, n):
threthold = 50
db_instance = db()
funcs = loadFuncs(base_input_dir)
db_instance.loadHashmap(n, 50000)
#pdb.set_trace()
featrues = retrieveFeatures(n, base_input_dir, filename1, funcs)
queries = retrieveQueries(n, base_input_dir, filename2, funcs)
#queries = refactoring(queries, featrues)
vul_db = retrieveVuldb(base_input_dir)
pdb.set_trace()
#dump(base_input_dir, featrues, queries)
#start = time.time()
#db_instance.batch_appendDBbyDir(base_input_dir)
#end = time.time()
#total = end - start
#print total
db_instance.batch_appendDB(filename1, featrues)
pdb.set_trace()
ranks = []
times = []
for threthold in xrange(1, 210, 10):
hit = []
i = 0
for name in queries:
#print i
i += 1
'''
if i == 1000:
print (sum(times)/len(times))
pdb.set_trace()
print "s"
'''
#if name not in vul_db['openssl']:
# continue
if name not in featrues:
continue
#pdb.set_trace()
query = queries[name]
#start = time.time()
x = db_instance.engine.neighbours(query)
#end = time.time()
#total = end - start
#times.append(total)
#print total
#pdb.set_trace()
try:
rank = [v for v in xrange(len(x)) if name in x[v][1]][0]
ranks.append((name, rank))
if rank <= threthold:
hit.append(1)
else:
hit.append(0)
except:
#pdb.set_trace()
hit.append(0)
pass
#pdb.set_trace()
acc = sum(hit) * 1.0 / len(hit)
print acc
def queryAll(base_dir, firmware_name, filename1, n):
threthold = 155
db_instance = db()
db_instance.loadHashmap(n, 50000)
queries = retrieveQueriesbyDir(n, base_dir, firmware_name, filename1)
start = time.time()
pdb.set_trace()
db_instance.batch_appendDBbyDir(n, base_dir)
end = time.time()
dur = end - start
print dur
pdb.set_trace()
hit = []
i = 0
times = []
for name in queries:
print i
i += 1
query = queries[name]
start = time.clock()
x = db_instance.engine.neighbours(query)
end = time.clock()
dur = end - start
times.append(dur)
#pdb.set_trace()
try:
rank = [v for v in xrange(len(x)) if name in x[v][1]]
if len(rank) > 1:
pdb.set_trace()
print "stop"
if rank[0] <= threthold:
hit.append(1)
else:
hit.append(0)
except:
hit.append(0)
acc = sum(hit) * 1.0 / len(hit)
mean = np.mean(times)
std = np.std(times)
#pdb.set_trace()
print acc
if __name__ == "__main__":
args = parse_command()
base_dir = args.base_input_dir
filename1 = args.filename1
filename2 = args.filename2
n = args.size
pdb.set_trace()
queryBytwo(base_dir, filename1, filename2, n)
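For reference, a minimal nearpy round trip (assuming the standard nearpy API, without the redis-backed storage used above):

import numpy as np
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections

dim = 192  # matches the dimension hard-coded in loadHashmap above
engine = Engine(dim, lshashes=[RandomBinaryProjections('test', 10)])
engine.store_vector(np.random.rand(dim), 'binary_name.funcname')
matches = engine.neighbours(np.random.rand(dim))  # [(vector, data, distance), ...]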

View File

@@ -1,16 +0,0 @@
@echo off
setlocal EnableDelayedExpansion
set "FOLDER_PATH=D:\bishe\dataset\train_malware"
for %%f in ("%FOLDER_PATH%\*") do (
echo !time! %%f
D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out %%f
)
endlocal