247 lines
12 KiB
Python
247 lines
12 KiB
Python
import angr
|
||
import csv
|
||
from angrutils import plot_cfg, hook0, set_plot_style
|
||
import bingraphvis
|
||
import networkx as nx
|
||
import os
|
||
from tqdm import tqdm
|
||
import sys
|
||
# sys.path.append(r'../ASM2VEC_plus_scripts/')
|
||
# from func2vec import func2vec,load_model
|
||
# from node_feature import asm2vec_plus
|
||
import lief
|
||
from detect_pe_packer import detect_pack_res
|
||
# exit()
|
||
from node_feature import *
|
||
|
||
def my_round(vec_list):
|
||
for i in range(len(vec_list)):
|
||
vec_list[i] = round(vec_list[i], 5)
|
||
return vec_list
|
||
|
||
def get_node_feature(hex_asm="558bec83ec085756bf0bb80000ff15d4804000",node_feature_method="n_gram"):
|
||
if node_feature_method =="asm2vec_plus_16":
|
||
return asm2vec_plus_16(hex_asm=hex_asm).tolist()
|
||
elif node_feature_method =="asm2vec_plus_32":
|
||
return asm2vec_plus_32(hex_asm=hex_asm).tolist()
|
||
elif node_feature_method =="asm2vec_plus_64":
|
||
return asm2vec_plus_64(hex_asm=hex_asm).tolist()
|
||
elif node_feature_method =="asm2vec_plus_128":
|
||
return asm2vec_plus_128(hex_asm=hex_asm).tolist()
|
||
elif node_feature_method =="asm2vec_plus_256":
|
||
return asm2vec_plus_256(hex_asm=hex_asm).tolist()
|
||
|
||
elif node_feature_method =="asm2vec_base_16":
|
||
return asm2vec_base_16(hex_asm=hex_asm).tolist()
|
||
elif node_feature_method =="asm2vec_base_32":
|
||
return asm2vec_base_32(hex_asm=hex_asm).tolist()
|
||
elif node_feature_method =="asm2vec_base_64":
|
||
return asm2vec_base_64(hex_asm=hex_asm).tolist()
|
||
elif node_feature_method =="asm2vec_base_128":
|
||
return asm2vec_base_128(hex_asm=hex_asm).tolist()
|
||
elif node_feature_method =="asm2vec_base_256":
|
||
return asm2vec_base_256(hex_asm=hex_asm).tolist()
|
||
|
||
elif node_feature_method == "asm2vec_s_base_16":
|
||
return asm2vec_s_base_16(hex_asm=hex_asm).tolist()
|
||
elif node_feature_method == "asm2vec_s_base_32":
|
||
return asm2vec_s_base_32(hex_asm=hex_asm).tolist()
|
||
elif node_feature_method == "asm2vec_s_base_64":
|
||
return asm2vec_s_base_64(hex_asm=hex_asm).tolist()
|
||
elif node_feature_method == "asm2vec_s_base_128":
|
||
return asm2vec_s_base_128(hex_asm=hex_asm).tolist()
|
||
elif node_feature_method == "asm2vec_s_base_256":
|
||
return asm2vec_s_base_256(hex_asm=hex_asm).tolist()
|
||
|
||
elif node_feature_method == "asm2vec_s368_base_16":
|
||
return asm2vec_s368_base_16(hex_asm=hex_asm).tolist()
|
||
elif node_feature_method == "asm2vec_s368_base_32":
|
||
return asm2vec_s368_base_32(hex_asm=hex_asm).tolist()
|
||
elif node_feature_method == "asm2vec_s368_base_64":
|
||
return asm2vec_s368_base_64(hex_asm=hex_asm).tolist()
|
||
elif node_feature_method == "asm2vec_s368_base_128":
|
||
return asm2vec_s368_base_128(hex_asm=hex_asm).tolist()
|
||
elif node_feature_method == "asm2vec_s368_base_256":
|
||
return asm2vec_s368_base_256(hex_asm=hex_asm).tolist()
|
||
|
||
elif node_feature_method =="malconv":
|
||
return malconv(hex_asm=hex_asm)
|
||
elif node_feature_method =="n_gram":
|
||
return n_gram(hex_asm=hex_asm)
|
||
elif node_feature_method =="word_frequency":
|
||
return word_frequency(hex_asm=hex_asm)
|
||
elif node_feature_method =="asm2vec_plus_small":
|
||
return asm2vec_plus_small(hex_asm=hex_asm).tolist()
|
||
elif node_feature_method =="asm2vec_base_small":
|
||
return asm2vec_base_small(hex_asm=hex_asm).tolist()
|
||
|
||
asm2vec_model_path="../ASM2VEC_plus_scripts/asm2vec_checkpoints/model_100.pt"
|
||
|
||
# print(bengin_list)
|
||
|
||
# jmp_family=["jmp","call","ret","retf",
|
||
# "ja","jnbe","jae","jnb","jb","jane","jbe","jna",
|
||
# "jg","jnle","jge","jnl","jl","jnge","jle","jng",
|
||
# "je","jz","jne","jnz","jc","jnc","jno","jnp","jpo",
|
||
# "jns","jo","jp","jpe","js"
|
||
# "loop","loope","loopz","loopne","loopnz","jcxz","jecxz"
|
||
# ]
|
||
|
||
|
||
|
||
def get_new_section_addr(bin_parse):
|
||
entry_point = bin_parse.optional_header.addressof_entrypoint
|
||
# 找到入口点所在的section名字
|
||
entryname = bin_parse.section_from_rva(entry_point).name
|
||
for section in bin_parse.sections:
|
||
if section.name == entryname:
|
||
text_characteristics = section.characteristics
|
||
virtual_address = section.virtual_address
|
||
virtual_size = section.size
|
||
virtual_offset = section.offset
|
||
# 创建新段,并设置偏移位置
|
||
new_section = lief.PE.Section("test")
|
||
if virtual_size % 0x1000 == 0:
|
||
mod_num = int(virtual_size / 0x1000)
|
||
else:
|
||
mod_num = int(virtual_size / 0x1000) + 1
|
||
new_section.virtual_address = virtual_address + mod_num * 0x1000
|
||
# print(hex(new_section.virtual_address))
|
||
# exit()
|
||
# print(hex(new_section.virtual_address + bin_parse.imagebase))
|
||
|
||
new_section.offset = virtual_offset + virtual_size
|
||
return new_section.offset
|
||
|
||
|
||
def cfg_extract(file_list,data_dir="../data/malware",CFG_dir="../CFG_data/malware",csv_save_path="../CFG_data/malware_msg.csv",header = ['malware_name','nodes_num','edgs_num']):
|
||
|
||
csv_data=[]
|
||
#载入asm2vec的模型
|
||
# asm2vec_model=load_model(asm2vec_model_path)
|
||
for i in tqdm(range(len(file_list))):
|
||
# for i in tqdm(range(0,50)):
|
||
file_item=file_list[i]
|
||
|
||
#剔除加壳程序
|
||
|
||
|
||
try:
|
||
if detect_pack_res(os.path.join(data_dir, file_item)) == True:
|
||
continue
|
||
bin_parse = lief.PE.parse(os.path.join(data_dir,file_item))
|
||
p = angr.Project(os.path.join(data_dir,file_item), load_options={'auto_load_libs': False})
|
||
cfg = p.analyses.CFGFast(show_progressbar=True, normalize=False, resolve_indirect_jumps=False,
|
||
force_smart_scan=False, symbols=False, data_references=False)
|
||
# new_section_addr = get_new_section_addr(bin_parse)
|
||
except:
|
||
continue
|
||
|
||
|
||
G = cfg.graph
|
||
|
||
#如果反汇编有出错,则当前这个文件舍弃
|
||
flag=1
|
||
#为每个节点设置相关信息
|
||
insert_point_count=0
|
||
for e,dict in G.nodes.items():
|
||
|
||
try:
|
||
asm_hex=e.block.bytes.hex().replace("0x","")
|
||
#不能正确生成vector的函数不要
|
||
# dict["feature"] = str(vec)
|
||
#判断是否有插入点
|
||
# dict["has_insertPoint"] = False
|
||
#该basic_block的bytes指令
|
||
|
||
dict["asm2vec_plus_16"] = str(my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="asm2vec_plus_16")))
|
||
except Exception as ea:
|
||
flag = 0
|
||
break
|
||
|
||
dict["asm2vec_plus_32"] = str(my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="asm2vec_plus_32")))
|
||
dict["asm2vec_plus_64"] = str(my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="asm2vec_plus_64")))
|
||
dict["asm2vec_plus_128"] = str(my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="asm2vec_plus_128")))
|
||
dict["asm2vec_plus_256"] = str(my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="asm2vec_plus_256")))
|
||
|
||
dict["asm2vec_base_16"] = str(my_round(get_node_feature(hex_asm=asm_hex,node_feature_method="asm2vec_base_16")))
|
||
dict["asm2vec_base_32"] = str(my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="asm2vec_base_32")))
|
||
dict["asm2vec_base_64"] = str(my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="asm2vec_base_64")))
|
||
dict["asm2vec_base_128"] = str(my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="asm2vec_base_128")))
|
||
dict["asm2vec_base_256"] = str(my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="asm2vec_base_256")))
|
||
|
||
dict["asm2vec_s_base_16"] = str(my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="asm2vec_s_base_16")))
|
||
dict["asm2vec_s_base_32"] = str(my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="asm2vec_s_base_32")))
|
||
dict["asm2vec_s_base_64"] = str(my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="asm2vec_s_base_64")))
|
||
dict["asm2vec_s_base_128"] = str(my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="asm2vec_s_base_128")))
|
||
dict["asm2vec_s_base_256"] = str(my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="asm2vec_s_base_256")))
|
||
|
||
dict["asm2vec_s368_base_16"] = str(my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="asm2vec_s368_base_16")))
|
||
dict["asm2vec_s368_base_32"] = str(my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="asm2vec_s368_base_32")))
|
||
dict["asm2vec_s368_base_64"] = str( my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="asm2vec_s368_base_64")))
|
||
dict["asm2vec_s368_base_128"] = str(my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="asm2vec_s368_base_128")))
|
||
dict["asm2vec_s368_base_256"] = str(my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="asm2vec_s368_base_256")))
|
||
|
||
dict["n_gram"] = str(my_round(get_node_feature(hex_asm=asm_hex, node_feature_method="n_gram")))
|
||
dict["malconv"]= str(my_round(get_node_feature(hex_asm=asm_hex,node_feature_method="malconv")))
|
||
dict["word_frequency"]= str(my_round(get_node_feature(hex_asm=asm_hex,node_feature_method="word_frequency")))
|
||
# dict["bytes"]=asm_hex
|
||
|
||
#用于记录当前basicblock插入func后,该func的地址值,初始值是原始程序的新建section的地址
|
||
|
||
|
||
if flag==1:
|
||
print("成功写入",os.path.join(CFG_dir,file_item)+".gexf")
|
||
nx.write_gexf(G, os.path.join(CFG_dir,file_item)+".gexf")
|
||
#写入节点数量与边数量信息
|
||
nodes_num=len(cfg.graph.nodes())
|
||
edgs_num=len(cfg.graph.edges())
|
||
csv_data.append([file_item+".gexf",str(nodes_num),str(edgs_num)])
|
||
|
||
with open(csv_save_path,'w',encoding='utf-8',newline='') as fp:
|
||
writer =csv.writer(fp)
|
||
writer.writerow(header)
|
||
writer.writerows(csv_data)
|
||
|
||
#用于直接读取gexf信息保存的代码
|
||
def write_csv(CFG_dir = "../CFG_data/malware",csv_save_path="../CFG_data/malware_msg.csv",header = ['malware_name','nodes_num','edgs_num']):
|
||
file_list=os.listdir(CFG_dir)
|
||
# benign_list=os.listdir(bengin_CFG_dir)
|
||
csv_data = []
|
||
|
||
for item_name in tqdm(file_list):
|
||
file=os.path.join(CFG_dir,item_name)
|
||
G=nx.read_gexf(file)
|
||
nodes_num = len(G.nodes())
|
||
edgs_num = len(G.edges())
|
||
csv_data.append([item_name, str(nodes_num), str(edgs_num)])
|
||
|
||
with open(csv_save_path,'w',encoding='utf-8',newline='') as fp:
|
||
writer =csv.writer(fp)
|
||
writer.writerow(header)
|
||
writer.writerows(csv_data)
|
||
print("成功结束")
|
||
# pass
|
||
|
||
if __name__ == '__main__':
|
||
malware_data_dir = "../data/malware"
|
||
bengin_data_dir = "../data/benign"
|
||
# pack_data_dir= "../data/pack"
|
||
|
||
malware_CFG_dir = "../cfg_data_with_feature/malware"
|
||
bengin_CFG_dir = "../cfg_data_with_feature/benign"
|
||
# pack_CFG_dir = "../cfg_data/pack"
|
||
|
||
malware_list = os.listdir(malware_data_dir)
|
||
bengin_list = os.listdir(bengin_data_dir)
|
||
# pack_list = os.listdir(pack_data_dir)
|
||
# print(pack_list)
|
||
# exit()
|
||
#
|
||
# cfg_extract(pack_list, data_dir=pack_data_dir, CFG_dir=pack_CFG_dir,
|
||
# csv_save_path="../cfg_data/pack_msg.csv", header=['bengin_name', 'nodes_num', 'edgs_num'])
|
||
|
||
# cfg_extract(malware_list,data_dir=malware_data_dir,CFG_dir=malware_CFG_dir,csv_save_path="../cfg_data_with_feature/malware_msg1.csv",header = ['malware_name','nodes_num','edgs_num'])
|
||
cfg_extract(bengin_list, data_dir=bengin_data_dir ,CFG_dir=bengin_CFG_dir ,csv_save_path="../cfg_data_with_feature/benign_msg.csv", header = ['bengin_name', 'nodes_num', 'edgs_num'])
|
||
# write_csv(CFG_dir = "../CFG_data/malware_asm2vec_base",csv_save_path="../CFG_data/malware_msg_asm2vec_base.csv",header = ['malware_name','nodes_num','edgs_num','insert_point_count'])
|
||
# write_csv(CFG_dir="../CFG_data/benign_asm2vec_base", csv_save_path="../CFG_data/benign_msg_asm2vec_base.csv",header=['benign_name', 'nodes_num', 'edgs_num','insert_point_count']) |