Complete Raw-feature-extractor

Complete the reproduction of the Raw-feature-extractor:

The purpose of read_idaFILE.py is to read the raw features from the generated .ida file and display them.
This commit is contained in:
Erio 2021-11-19 16:29:15 +08:00
parent e29e36aa32
commit 17c1ac88b1
11 changed files with 125 additions and 221743 deletions

File diff suppressed because it is too large Load Diff

View File

@ -126,7 +126,7 @@ def get_func_cfgs_c(ea):
i += 1 i += 1
icfg = cfg.getCfg(func, externs_eas, ea_externs) icfg = cfg.getCfg(func, externs_eas, ea_externs)
func_f = get_discoverRe_feature(func, icfg[0]) func_f = get_discoverRe_feature(func, icfg[0])
raw_g = raw_graph(funcname, icfg, func_f) #生成一个rawcfg。raw_graph是一个python class定义在 raw_graph.py raw_g = raw_graph(funcname, icfg, func_f) #生成一个rawcfg。raw_graph是一个python class定义在 raw_graph.py.包含g本文的ACFG、olg_gdiscovRe的acfg、feature函数级别的一些特征以及betweenness
raw_cfgs.append(raw_g) # raw_graphs 是另一个python class存储raw_graph的list。定义在 raw_graph.py raw_cfgs.append(raw_g) # raw_graphs 是另一个python class存储raw_graph的list。定义在 raw_graph.py
#print(raw_g.__dict__) #print(raw_g.__dict__)
#print(raw_g) 由于raw_graph、raw_graphs都是class直接print只会打印<raw_graphs.raw_graphs instance at 0x09888FD0>,不能打印对象的属性。 #https://blog.51cto.com/steed/2046408 print_obj、 print(obj.__dict__) #print(raw_g) 由于raw_graph、raw_graphs都是class直接print只会打印<raw_graphs.raw_graphs instance at 0x09888FD0>,不能打印对象的属性。 #https://blog.51cto.com/steed/2046408 print_obj、 print(obj.__dict__)

View File

@ -1,9 +1,11 @@
# -*- coding: UTF-8 -*-
import networkx as nx import networkx as nx
import pdb import pdb
def betweeness(g): def betweeness(g):
#pdb.set_trace() #pdb.set_trace()
betweenness = nx.betweenness_centrality(g) betweenness = nx.betweenness_centrality(g)
return betweenness #print betweenness
return betweenness #list
def eigenvector(g): def eigenvector(g):
centrality = nx.eigenvector_centrality(g) centrality = nx.eigenvector_centrality(g)
@ -13,7 +15,7 @@ def closeness_centrality(g):
closeness = nx.closeness_centrality(g) closeness = nx.closeness_centrality(g)
return closeness return closeness
def retrieveGP(g): def retrieveGP(g): #list转化为float。将基本块级别的betweeness转化为函数级别的betweeness
bf = betweeness(g) bf = betweeness(g)
#close = closeness_centrality(g) #close = closeness_centrality(g)
#bf_sim = #bf_sim =

View File

@ -19,10 +19,16 @@ def parse_command():
return args return args
if __name__ == '__main__': if __name__ == '__main__':
#E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
#print str(sys.argv) #['raw-feature-extractor/preprocessing_ida.py'] #print str(sys.argv) #['raw-feature-extractor/preprocessing_ida.py']
#print str(idc.ARGV) #['raw-feature-extractor/preprocessing_ida.py', '--path', 'C:\\Program1\\pycharmproject\\Genius3\\new'] #print str(idc.ARGV) #['raw-feature-extractor/preprocessing_ida.py', '--path', 'C:\\Program1\\pycharmproject\\Genius3\\acfgs']
#print idc.ARGV[2] #print idc.ARGV[2]
#print type(idc.ARGV[2]) #print type(idc.ARGV[2])
# E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -A -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius4\acfgs" hpcenter
#测试生成原始特征的时间。
start_t = time.clock()
args = parse_command() args = parse_command()
#path = args.path #path = args.path
path = idc.ARGV[2] path = idc.ARGV[2]
@ -32,6 +38,11 @@ if __name__ == '__main__':
idc.SetShortPrm(idc.INF_START_AF, analysis_flags) idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
idaapi.autoWait() idaapi.autoWait()
cfgs = get_func_cfgs_c(FirstSeg()) cfgs = get_func_cfgs_c(FirstSeg())
end_t = time.clock()
print (end_t - start_t) #1.5934438s hpcenter 83.4 KB #35.6745299s SCGDW698 5.5mb #14.1480888s 762kb SCMQTTIot 这个时间包括ida分析二进制文件的时间和脚本生成对应原始特征的时间
# 应该是随着函数和基本块的数量增加而线性增加的先不写了。可能ida分析二进制文件的占比比较高
binary_name = idc.GetInputFile() + '.ida' binary_name = idc.GetInputFile() + '.ida'
print path print path
print binary_name print binary_name
@ -39,13 +50,7 @@ if __name__ == '__main__':
pickle.dump(cfgs, open(fullpath,'w')) pickle.dump(cfgs, open(fullpath,'w'))
#print binary_name #print binary_name
testpath="C:\Program1\pycharmproject\Genius3/acfgs/hpcenter.ida"
fr = open(fullpath,'r')
data1 = pickle.load(fr) #加上这句脚本执行完就退出IDA
print(type(data1)) #<type 'instance'>
print(data1.raw_graph_list[393].__dict__)
print(data1.raw_graph_list[393].g)
print(data1.raw_graph_list[393].g.nodes())
#print_obj(data1)
#print cfgs.raw_graph_list[0]
#idc.Exit(0) #idc.Exit(0)

Binary file not shown.

View File

@ -1,7 +1,10 @@
# -*- coding: UTF-8 -*- # -*- coding: UTF-8 -*-
import itertools import itertools
import sys import sys
sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/') sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
sys.path.insert(1, 'C:/Python27/Lib/site-packages')
import networkx as nx import networkx as nx
#import numpy as np #import numpy as np
from subprocess import Popen, PIPE from subprocess import Popen, PIPE
@ -72,10 +75,10 @@ class raw_graph:
insts = g.node[id_]['numIns'] insts = g.node[id_]['numIns']
feature_vec.append(insts) feature_vec.append(insts)
# of LIs6 # of LIs6
insts = g.node[id_]['numLIs'] insts = g.node[id_]['numLIs'] #
feature_vec.append(insts) feature_vec.append(insts)
# of TIs7 # of TIs7
insts = g.node[id_]['numTIs'] insts = g.node[id_]['numTIs'] #transfer instructions
feature_vec.append(insts) feature_vec.append(insts)
return feature_vec return feature_vec

View File

@ -0,0 +1,100 @@
# -*- coding: UTF-8 -*-
import sys
import sys
from matplotlib import pyplot as plt
sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
sys.path.insert(1, 'C:/Python27/Lib/site-packages')
import networkx as nx
def print_obj(obj):
    """Print every instance attribute of *obj*, i.e. the contents of its ``__dict__``."""
    attributes = obj.__dict__
    print(attributes)
import pickle
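# sub_10F20 308: the decompiled code contains strings, but the extracted features contain no string constants;
# they are possibly referenced indirectly and therefore not recognized. Looking at the features of all functions,
# almost none of them has string constants - the strings are probably defined elsewhere and only referenced.
# sub_166C4 393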
if __name__ == '__main__':
    # Demo script: load the pickled raw features of one binary (.ida file),
    # print them at binary / function / basic-block level, then draw the
    # function's graphs with networkx + matplotlib.
    #
    # All print calls take exactly one argument so they behave the same under
    # the Python 2 print statement and the Python 3 print function.

    # Hard-coded sample input. Raw string so the backslashes can never be read
    # as escape sequences; the resulting path is unchanged.
    testpath = r"C:\Program1\pycharmproject\Genius3/acfgs/hpcenter.ida"

    # Index (into raw_graph_list) of the function to inspect.
    # NOTE(review): the notes in this file pair 393 with sub_166C4 - confirm.
    func_index = 393

    # SECURITY: pickle can execute arbitrary code while loading. Only open
    # .ida files that you generated yourself.
    # NOTE(review): text mode 'r' is kept on purpose - the producer script
    # appears to dump with open(..., 'w'); switch both sides to binary
    # together, not just this one.
    with open(testpath, 'r') as fr:
        data1 = pickle.load(fr)  # raw features (ACFGs) of one binary

    # --- Binary level: the container holding every function's raw features.
    print("一个二进制文件的所有函数的原始特征list。")
    print_obj(data1)
    print("\n")

    # --- Function level: per the message printed below, one entry consists of
    # old_g, g and fun_features.
    print("一个函数的原始特征由old_gdiscovRe方法的ACFGgGenius方法的ACFGfun_feature表示函数级别的特征的向量三部分构成")
    func_graph = data1.raw_graph_list[func_index]
    print_obj(func_graph)
    print("\n")

    feature = func_graph.fun_features
    print("函数级别特征: # 1 function calls # 2 logic instructions # 3 TransferIns # 4 LocalVariables # 5 BB basicblocks# 6 Edges # 7 IncommingCalls# 8 Intrs# 9 between # 10 strings # 11 consts")
    print(feature)
    print("\n")

    # --- Basic-block level: attributes of node 0 of g.
    # Legend (from the original notes): 1 'consts' numeric constants,
    # 2 'strings' string constants, 3 'offs' offspring (number of child
    # nodes?), 4 'numAs' arithmetic instructions such as INC, 5 'numCalls'
    # call instructions, 6 'numIns' instruction count, 7 'numLIs' logic
    # instructions such as AND, 8 'numTIs' transfer instruction count.
    acfg = func_graph.g
    print("# 一个基本块的特征 #1'consts' 数字常量 #2'strings'字符串常量 #3'offs' offspring 字节点数量? #4'numAs' 算数指令如INC #5'numCalls' 调用指令 #6'numIns' 指令数量 #7'numLIs' LogicInstructions 如AND #8'numTIs' 转移指令数量")
    print(acfg.node[0])  # G.node[i] is a dict of that node's attributes
    print("\n")

    # --- Plots. Per the original notes, old_g is the CFG read from IDA and g
    # is generated from old_g, so node/edge counts and directions should match.
    old_cfg = func_graph.old_g
    nx.draw(old_cfg, with_labels=True)
    plt.show()

    acfg = func_graph.g
    nx.draw(acfg, with_labels=True)
    plt.show()

    # Draw g again, this time labelling each node with its 'v' attribute
    # (the raw feature vector, per the original notes; old_g's attributes are
    # described in cfg_constructor.py).
    pos = nx.spring_layout(acfg)
    nx.draw(acfg, pos)
    node_labels = nx.get_node_attributes(acfg, 'v')
    nx.draw_networkx_labels(acfg, pos, labels=node_labels)
    plt.show()
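# 1 function calls: number of function-call instructions in this function (call, jal, jalr). Note: ARM has none of these instructions.
# 2 logic instructions: number of logical-operation instructions in this function, e.g. and, or.
# 3 TransferIns: number of transfer instructions, e.g. jmp (mov on ARM).
# 4 LocalVariables: number of local variables.
# 5 BB: number of basic blocks.
# 6 Edges: number of icfg edges. icfg is a feature from another paper (discovRe); ignored here for now.
# 7 IncommingCalls: number of instructions that call this function.
# 8 Intrs: number of instructions.
# 9 between: betweenness, one of the structural features.
# 10 strings: string constants.
# 11 consts: numeric constants.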