Compare commits

..

No commits in common. "bd51d89a0b8a7f531512075604562259bc52b00b" and "5de6c4568c0d825b0f0a3ad2fc18e2eea3ff5646" have entirely different histories.

3 changed files with 72 additions and 105 deletions

View File

@ -1,18 +0,0 @@
@echo off
rem Run IDA over every .exe found in FOLDER_PATH and execute the
rem feature-extraction script on each one.
rem
rem For each binary, IDA is launched with -S so that the script named by
rem SCRIPT_PATH receives "--path <SAVE_PATH>" as its script arguments.
rem See the IDA command-line switch reference for the meaning of -c and -B.
setlocal enabledelayedexpansion

rem Machine-specific locations; adjust before running on another host.
set "IDA_PATH=D:\IDA_Pro_v6.8\idaq.exe"
set "FOLDER_PATH=D:\bishe\Gencoding\A2C"
rem NOTE(review): the paths below are relative; presumably they are meant to
rem be resolved from the directory this script is started in - confirm.
set "SCRIPT_PATH=../raw-feature-extractor/preprocessing_ida.py"
set "SAVE_PATH=../store/"
rem NOTE(review): LOG_PATH is defined but never used in this script.
set "LOG_PATH=../log/"

for %%f in ("%FOLDER_PATH%\*.exe") do (
    rem !time! relies on delayed expansion so the timestamp is read per iteration.
    echo !time! %%f
    rem Quote the executable and the input file so that paths containing
    rem spaces reach IDA as single arguments.
    "%IDA_PATH%" -c -B -S"%SCRIPT_PATH% --path %SAVE_PATH%" "%%~f"
)
endlocal

View File

@ -1,4 +1,3 @@
import idc
from func import * from func import *
from raw_graphs import * from raw_graphs import *
from idc import * from idc import *
@ -14,11 +13,8 @@ def parse_command():
if __name__ == '__main__': if __name__ == '__main__':
# def main_op(store_file_path):
args = parse_command() args = parse_command()
# path = os.path.join("../")
path = idc.ARGV[2] path = idc.ARGV[2]
print os.getcwd()
analysis_flags = idc.GetShortPrm(idc.INF_START_AF) analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
analysis_flags &= ~idc.AF_IMMOFF analysis_flags &= ~idc.AF_IMMOFF
# turn off "automatically make offset" heuristic # turn off "automatically make offset" heuristic
@ -29,6 +25,3 @@ if __name__ == '__main__':
fullpath = os.path.join(path, binary_name) fullpath = os.path.join(path, binary_name)
pickle.dump(cfgs, open(fullpath, 'w')) pickle.dump(cfgs, open(fullpath, 'w'))
idc.Exit(0) idc.Exit(0)

View File

@ -1,5 +1,4 @@
# -*- coding: UTF-8 -*- # -*- coding: UTF-8 -*-
import os
import sys import sys
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
import networkx as nx import networkx as nx
@ -8,6 +7,11 @@ import hashlib
import json import json
def print_obj(obj):
    """Print all attributes of the object (its instance ``__dict__``)."""
    attributes = obj.__dict__
    print(attributes)
def calc_sha256(file_path): def calc_sha256(file_path):
with open(file_path, 'rb') as f: with open(file_path, 'rb') as f:
bytes = f.read() bytes = f.read()
@ -21,85 +25,73 @@ import pickle
# sub_10F20 308 反编译代码有字符串,但是这个特征提取里没有字符串 constant可能是间接引用的不识别。看了下所有函数的特征几乎都没有字符串常量可能都是写在别的地方然后引用的。 # sub_10F20 308 反编译代码有字符串,但是这个特征提取里没有字符串 constant可能是间接引用的不识别。看了下所有函数的特征几乎都没有字符串常量可能都是写在别的地方然后引用的。
# sub_166C4 393 # sub_166C4 393
if __name__ == '__main__': if __name__ == '__main__':
file_name_list = os.listdir('../A2C/') file_path = '../3c580f5beca53b6599e5f04d3aa68a34bd50521d7ec5d7163849eb69f53a4150.exe'
res_file = "../sample.jsonl" testpath = '../store/3c580f5beca53b6599e5f04d3aa68a34bd50521d7ec5d7163849eb69f53a4150.exe.ida'
sample_file = open(res_file, mode='a') fr = open(testpath, 'r')
for file_name in file_name_list: data1 = pickle.load(fr) # 一个二进制文件的acfgs
print file_name # function_edges
file_path = '../A2C/' + file_name function_edge_start = []
testpath = '../store/' + file_name + '.ida' function_edge_end = []
if os.path.exists(testpath) and os.path.splitext(file_path)[-1].lower() == '.exe': for item in data1.raw_graph_list[0].old_g.edges:
fr = open(testpath, 'r') function_edge_start.append(item[0])
data1 = pickle.load(fr) # 一个二进制文件的acfgs function_edge_end.append(item[1])
# funtion num function_edges = [function_edge_start, function_edge_end]
function_number = len(data1.raw_graph_list) fun_name_temp = []
if function_number == 0: # function hsah
continue file_hash = calc_sha256(file_path)
# function_edges # function num
function_edge_start = [] function_number = len(data1.raw_graph_list)
function_edge_end = [] acfg_list = []
for item in data1.raw_graph_list[0].old_g.edges: # 函数级特征
function_edge_start.append(item[0]) for i in range(len(data1.raw_graph_list)):
function_edge_end.append(item[1])
function_edges = [function_edge_start, function_edge_end]
fun_name_temp = []
# file hash (SHA-256 of the binary)
file_hash = calc_sha256(file_path)
acfg_list = [] # function name
# 函数级特征 fun_name_temp.append(data1.raw_graph_list[i].funcname)
for i in range(len(data1.raw_graph_list)): # block features
temp_G = data1.raw_graph_list[i].old_g
# block_number
block_number = len(temp_G.node)
# block_features
acfg_list_item_feature = []
for temp in range(len(temp_G.node)):
block_features = []
# call
block_features.append(temp_G.node[temp]['numCalls'])
# transfer
block_features.append(temp_G.node[temp]['numTIs'])
# arithmetic
block_features.append(temp_G.node[temp]['numAs'])
# logic
block_features.append(temp_G.node[temp]['numLIs'])
# compare
block_features.append(temp_G.node[temp]['numCom'])
# move
block_features.append(temp_G.node[temp]['numMov'])
# termination
block_features.append(temp_G.node[temp]['numTerm'])
# data declaration
block_features.append(temp_G.node[temp]['numDD'])
# total instructions
block_features.append(temp_G.node[temp]['numIns'])
# string or integer constants
block_features.append(len(temp_G.node[temp]['strings']) if len(temp_G.node[temp]['strings']) != 0 else len(
temp_G.node[temp]['consts']))
# offspring
block_features.append(temp_G.node[temp]['offs'])
acfg_list_item_feature.append(block_features)
edge_list_start = []
edge_list_end = []
for item in temp_G.edges:
edge_list_start.append(item[0])
edge_list_end.append(item[1])
block_edges = [edge_list_start, edge_list_end]
acfg_list_item = {"block_number": block_number, "block_edges": block_edges, "block_features": acfg_list_item_feature}
acfg_list.append(acfg_list_item)
# function name json_temp = {"function_edges": function_edges, "acfg_list": acfg_list, "function_names": fun_name_temp, "hash": file_hash, "function_number": function_number}
fun_name_temp.append(data1.raw_graph_list[i].funcname) json_str = json.dumps(json_temp)
# block features print json_str
temp_G = data1.raw_graph_list[i].old_g
# block_number
block_number = len(temp_G.node)
# block_features
acfg_list_item_feature = []
for temp in range(len(temp_G.node)):
block_features = []
# call
block_features.append(temp_G.node[temp]['numCalls'])
# transfer
block_features.append(temp_G.node[temp]['numTIs'])
# arithmetic
block_features.append(temp_G.node[temp]['numAs'])
# logic
block_features.append(temp_G.node[temp]['numLIs'])
# compare
block_features.append(temp_G.node[temp]['numCom'])
# move
block_features.append(temp_G.node[temp]['numMov'])
# termination
block_features.append(temp_G.node[temp]['numTerm'])
# data declaration
block_features.append(temp_G.node[temp]['numDD'])
# total instructions
block_features.append(temp_G.node[temp]['numIns'])
# string or integer constants
block_features.append(
len(temp_G.node[temp]['strings']) if len(temp_G.node[temp]['strings']) != 0 else len(
temp_G.node[temp]['consts']))
# offspring
block_features.append(temp_G.node[temp]['offs'])
acfg_list_item_feature.append(block_features)
edge_list_start = []
edge_list_end = []
for item in temp_G.edges:
edge_list_start.append(item[0])
edge_list_end.append(item[1])
block_edges = [edge_list_start, edge_list_end]
acfg_list_item = {"block_number": block_number, "block_edges": block_edges,
"block_features": acfg_list_item_feature}
acfg_list.append(acfg_list_item)
json_temp = {"function_edges": function_edges, "acfg_list": acfg_list, "function_names": fun_name_temp,
"hash": file_hash, "function_number": function_number}
json_str = json.dumps(json_temp)
sample_file.write(json_str)
else:
print "删除文件" + file_path
os.remove(file_path)
sample_file.close()