批量化操作
This commit is contained in:
parent
8063d079db
commit
0f1e3378a2
@ -1,4 +1,5 @@
|
||||
# coding=utf-8
|
||||
import hashlib
|
||||
import pickle as pk
|
||||
import re
|
||||
import json
|
||||
@ -6,125 +7,133 @@ import os
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def convert(start, end, overhaul):
|
||||
for workflow in range(start, end):
|
||||
# workflow = 0
|
||||
cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
|
||||
output_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow)
|
||||
dot_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot".format(workflow)
|
||||
def calc_sha256(file_path):
    """Return the SHA-256 hex digest of the file at ``file_path``.

    The file is opened in binary mode and hashed in fixed-size chunks, so
    large samples are never loaded into memory in one piece.

    :param file_path: path of the file to hash.
    :return: lowercase hexadecimal SHA-256 digest string.
    :raises IOError/OSError: if the file cannot be opened or read.
    """
    chunk_size = 1 << 20  # 1 MiB per read keeps memory use flat
    digest = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops at the first empty read (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
|
||||
|
||||
log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow)
|
||||
process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)
|
||||
|
||||
if overhaul:
|
||||
if os.path.exists(log_path):
|
||||
os.remove(log_path)
|
||||
if os.path.exists(process_log_path):
|
||||
os.remove(process_log_path)
|
||||
def convert_malware(overhaul):
|
||||
cfg_dir = "D:\\bishe\\dataset\\infected\\infected_cfg"
|
||||
output_dir = "D:\\bishe\\dataset\\infected\\infected_jsonl"
|
||||
dot_dir = "D:\\bishe\\dataset\\infected\\infected_dot"
|
||||
raw_dir = "D:\\bishe\\dataset\\train_malware"
|
||||
|
||||
with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
|
||||
logged = log.readline()
|
||||
if logged == '':
|
||||
log_index = 0
|
||||
log_path = "D:\\bishe\\dataset\\logging\\convert_malware_log.log"
|
||||
process_log_path = "D:\\bishe\\dataset\\logging\\convert_malware_process_log.log"
|
||||
|
||||
if overhaul:
|
||||
if os.path.exists(log_path):
|
||||
os.remove(log_path)
|
||||
if os.path.exists(process_log_path):
|
||||
os.remove(process_log_path)
|
||||
|
||||
with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
|
||||
logged = log.readline()
|
||||
if logged == '':
|
||||
log_index = 0
|
||||
else:
|
||||
log_index = int(logged)
|
||||
|
||||
for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))):
|
||||
if index < log_index:
|
||||
continue
|
||||
|
||||
name = cfg[:-4] # 纯文件名,不带后缀
|
||||
cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
|
||||
try:
|
||||
data = pk.load(cfg_file)
|
||||
except EOFError:
|
||||
process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg))
|
||||
continue
|
||||
except ValueError:
|
||||
process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
|
||||
continue
|
||||
finally:
|
||||
cfg_file.close()
|
||||
|
||||
dot_file_path = os.path.join(dot_dir, name + '.dot')
|
||||
if not os.path.exists(dot_file_path):
|
||||
process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg))
|
||||
else:
|
||||
log_index = int(logged)
|
||||
# 打开dot文件获取fcg
|
||||
raw_function_edges = []
|
||||
# 2023.8.12 bug fix: ida生成的fcg(.dot)文件包含了所有函数,data.raw_graph_list仅包含了内部函数
|
||||
functions_list = []
|
||||
with open(dot_file_path, 'r') as dot:
|
||||
for line in dot:
|
||||
if '->' in line:
|
||||
raw_function_edges.append(re.findall(r'\b\d+\b', line))
|
||||
elif 'label' in line:
|
||||
functions_list.append(line[line.find('= "') + 3:line.find('",')])
|
||||
|
||||
for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))):
|
||||
if index < log_index:
|
||||
# 没有内部函数被检测到,正常来说不应该,保险起见还是不要这数据了
|
||||
if raw_function_edges.__len__() == 0:
|
||||
continue
|
||||
|
||||
name = cfg[:-4] # 纯文件名,不带后缀
|
||||
cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
|
||||
try:
|
||||
data = pk.load(cfg_file)
|
||||
except EOFError:
|
||||
process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg))
|
||||
continue
|
||||
except ValueError:
|
||||
process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
|
||||
continue
|
||||
finally:
|
||||
cfg_file.close()
|
||||
# 为当前pe文件创建json对象
|
||||
json_obj = {
|
||||
'hash': calc_sha256(raw_dir + "\\" + name),
|
||||
# 2023.8.12 bug fix: 这里获取的是内部函数的数量
|
||||
# 'function_number': data.raw_graph_list.__len__(),
|
||||
'function_number': len(functions_list),
|
||||
'function_edges': [[int(d[0]) for d in raw_function_edges],
|
||||
[int(d[1]) for d in raw_function_edges]],
|
||||
'acfg_list': [],
|
||||
'function_names': functions_list
|
||||
}
|
||||
|
||||
dot_file_path = os.path.join(dot_dir, name + '.dot')
|
||||
if not os.path.exists(dot_file_path):
|
||||
process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg))
|
||||
else:
|
||||
# 打开dot文件获取fcg
|
||||
raw_function_edges = []
|
||||
# 2023.8.12 bug fix: ida生成的fcg(.dot)文件包含了所有函数,data.raw_graph_list仅包含了内部函数
|
||||
functions_list = []
|
||||
with open(dot_file_path, 'r') as dot:
|
||||
for line in dot:
|
||||
if '->' in line:
|
||||
raw_function_edges.append(re.findall(r'\b\d+\b', line))
|
||||
elif 'label' in line:
|
||||
functions_list.append(line[line.find('= "') + 3:line.find('",')])
|
||||
|
||||
# 没有内部函数被检测到,正常来说不应该,保险起见还是不要这数据了
|
||||
if raw_function_edges.__len__() == 0:
|
||||
# 2023.8.12 bug fix: data.raw_graph_list是ida检测到的内部函数,不包括外部函数,因此函数列表和函数数量不能从这里获取
|
||||
# 读取pkl文件,一个acfg由一个函数分解而来
|
||||
for acfg in data.raw_graph_list:
|
||||
# 函数为外部函数,不需要构建cfg
|
||||
if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname:
|
||||
continue
|
||||
|
||||
# 为当前pe文件创建json对象
|
||||
json_obj = {
|
||||
'hash': data.binary_name[11:],
|
||||
# 2023.8.12 bug fix: 这里获取的是内部函数的数量
|
||||
# 'function_number': data.raw_graph_list.__len__(),
|
||||
'function_number': len(functions_list),
|
||||
'function_edges': [[int(d[0]) for d in raw_function_edges],
|
||||
[int(d[1]) for d in raw_function_edges]],
|
||||
'acfg_list': [],
|
||||
'function_names': functions_list
|
||||
# 这里2是因为Genius框架提取特征时将后代数量放在2
|
||||
offspring = [d.get('v')[2] for d in acfg.g.node.values()]
|
||||
# 这边可能会出现不知名的原因两个数组长度不一致,按理来说应该是一致的
|
||||
# 以框架为主,将bb_features数组削减为和g.node长度一致
|
||||
diff = acfg.g.__len__() - len(acfg.bb_features)
|
||||
if diff != 0:
|
||||
del acfg.bb_features[diff:]
|
||||
# 将后代数量的特征放入bb_features中
|
||||
|
||||
for i, offs in enumerate(offspring):
|
||||
acfg.bb_features[i].append(offs)
|
||||
|
||||
acfg_item = {
|
||||
'block_number': acfg.g.__len__(),
|
||||
'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
|
||||
'block_features': acfg.bb_features
|
||||
}
|
||||
|
||||
# 2023.8.12 bug fix: data.raw_graph_list是ida检测到的内部函数,不包括外部函数,因此函数列表和函数数量不能从这里获取
|
||||
# 读取pkl文件,一个acfg由一个函数分解而来
|
||||
for acfg in data.raw_graph_list:
|
||||
# 函数为外部函数,不需要构建cfg
|
||||
if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname:
|
||||
continue
|
||||
json_obj['acfg_list'].append(acfg_item)
|
||||
# json_obj['function_names'].append(acfg.funcname)
|
||||
|
||||
# 这里2是因为Genius框架提取特征时将后代数量放在2
|
||||
offspring = [d.get('v')[2] for d in acfg.g.node.values()]
|
||||
# 这边可能会出现不知名的原因两个数组长度不一致,按理来说应该是一致的
|
||||
# 以框架为主,将bb_features数组削减为和g.node长度一致
|
||||
diff = acfg.g.__len__() - len(acfg.bb_features)
|
||||
if diff != 0:
|
||||
del acfg.bb_features[diff:]
|
||||
# 将后代数量的特征放入bb_features中
|
||||
# 将结果写入json本地文件
|
||||
result = json.dumps(json_obj, ensure_ascii=False)
|
||||
|
||||
for i, offs in enumerate(offspring):
|
||||
acfg.bb_features[i].append(offs)
|
||||
with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
|
||||
out.write(result)
|
||||
|
||||
acfg_item = {
|
||||
'block_number': acfg.g.__len__(),
|
||||
'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
|
||||
'block_features': acfg.bb_features
|
||||
}
|
||||
|
||||
json_obj['acfg_list'].append(acfg_item)
|
||||
# json_obj['function_names'].append(acfg.funcname)
|
||||
|
||||
# 将结果写入json本地文件
|
||||
result = json.dumps(json_obj, ensure_ascii=False)
|
||||
|
||||
with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
|
||||
out.write(result)
|
||||
|
||||
log.truncate(0)
|
||||
log.seek(0)
|
||||
log.write(str(index))
|
||||
log.flush()
|
||||
process_log.write("index {}, {} process done.\n".format(index, cfg))
|
||||
log.truncate(0)
|
||||
log.seek(0)
|
||||
log.write(str(index))
|
||||
log.flush()
|
||||
process_log.write("index {}, {} process done.\n".format(index, cfg))
|
||||
|
||||
|
||||
def convert_benign(overhaul):
|
||||
cfg_dir = "F:\\kkk\\dataset\\benign\\refind_cfg"
|
||||
dot_dir = "F:\\kkk\\dataset\\benign\\refind_dot"
|
||||
output_dir = "F:\\kkk\\dataset\\benign\\refind_jsonl"
|
||||
cfg_dir = "D:\\bishe\\dataset\\benign\\refind_cfg"
|
||||
dot_dir = "D:\\bishe\\dataset\\benign\\refind_dot"
|
||||
output_dir = "D:\\bishe\\dataset\\benign\\refind_jsonl"
|
||||
raw_dir = "D:\\bishe\\dataset\\train_benign"
|
||||
|
||||
log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_log.log"
|
||||
process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_process_log{}.log"
|
||||
log_path = "D:\\bishe\\dataset\\logging\\convert_benign_log.log"
|
||||
process_log_path = "D:\\bishe\\dataset\\logging\\convert_benign_process_log.log"
|
||||
|
||||
if overhaul:
|
||||
if os.path.exists(log_path):
|
||||
@ -145,6 +154,7 @@ def convert_benign(overhaul):
|
||||
continue
|
||||
|
||||
name = cfg[:-4] # 纯文件名
|
||||
|
||||
cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
|
||||
try:
|
||||
data = pk.load(cfg_file)
|
||||
@ -180,7 +190,7 @@ def convert_benign(overhaul):
|
||||
|
||||
# 为当前pe文件创建json对象
|
||||
json_obj = {
|
||||
'hash': data.binary_name[11:],
|
||||
'hash': calc_sha256(raw_dir + "\\" + name),
|
||||
# 2023.8.12 bug fix: 这里获取的是内部函数的数量
|
||||
# 'function_number': data.raw_graph_list.__len__(),
|
||||
'function_number': len(functions_list),
|
||||
@ -233,4 +243,6 @@ def convert_benign(overhaul):
|
||||
|
||||
if __name__ == '__main__':
|
||||
# convert(35, 69)
|
||||
convert_benign(False)
|
||||
# convert_benign(True)
|
||||
convert_benign(True)
|
||||
convert_malware(True)
|
||||
|
@ -1,54 +1,73 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
import pickle
|
||||
from func import *
|
||||
from idc import *
|
||||
# coding=utf-8
|
||||
import os
|
||||
import pickle
|
||||
import idc
|
||||
import idaapi
|
||||
|
||||
# 定义常量
|
||||
DATA_DIR = "D:\\bishe\\dataset"
|
||||
INFECTED_DIR = os.path.join(DATA_DIR, "infected")
|
||||
BENIGN_DIR = os.path.join(DATA_DIR, "benign")
|
||||
CFG_EXTENSION = ".ida"
|
||||
GDL_EXTENSION = ".dot"
|
||||
ASM_EXTENSION = ".asm"
|
||||
|
||||
def preprocess():
|
||||
# E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
|
||||
# print str(sys.argv) #['raw-feature-extractor/preprocessing_ida.py']
|
||||
# print str(idc.ARGV) #['raw-feature-extractor/preprocessing_ida.py', '--path', 'C:\\Program1\\pycharmproject\\Genius3\\acfgs']
|
||||
# print idc.ARGV[2]
|
||||
# print type(idc.ARGV[2])
|
||||
def preprocess(binary_name, workflow):
|
||||
cfg_path = os.path.join(
|
||||
INFECTED_DIR if workflow != "-1" else BENIGN_DIR,
|
||||
f"{binary_name}{CFG_EXTENSION}"
|
||||
)
|
||||
gdl_path = os.path.join(
|
||||
INFECTED_DIR if workflow != "-1" else BENIGN_DIR,
|
||||
f"{binary_name}{GDL_EXTENSION}"
|
||||
)
|
||||
asm_path = os.path.join(
|
||||
INFECTED_DIR if workflow != "-1" else BENIGN_DIR,
|
||||
f"{binary_name}{ASM_EXTENSION}"
|
||||
)
|
||||
|
||||
binary_name = idc.GetInputFile()
|
||||
|
||||
workflow = idc.ARGV[1]
|
||||
# workflow为特定值时分析良性软件,否则分析恶意软件
|
||||
if workflow == '-1':
|
||||
cfg_path = "D:\\bishe\\dataset\\benign\\refind_cfg\\{}.ida".format(binary_name)
|
||||
gdl_path = "D:\\bishe\\dataset\\benign\\refind_dot\\{}.dot".format(binary_name)
|
||||
asm_path = "D:\\bishe\\dataset\\benign\\refind_asm\\{}.asm".format(binary_name)
|
||||
if os.path.exists(cfg_path):
|
||||
idc.Exit(0)
|
||||
else:
|
||||
cfg_path = "D:\\bishe\\dataset\\infected\\infected_cfg\\{}.ida".format(binary_name)
|
||||
gdl_path = "D:\\bishe\\dataset\\infected\\infected_dot\\{}.dot".format(binary_name)
|
||||
asm_path = "D:\\bishe\\dataset\\infected\\infected_asm\\{}.asm".format(binary_name)
|
||||
analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
|
||||
analysis_flags &= ~idc.AF_IMMOFF
|
||||
idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
|
||||
|
||||
analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
|
||||
analysis_flags &= ~idc.AF_IMMOFF
|
||||
idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
|
||||
idaapi.autoWait()
|
||||
idaapi.autoWait()
|
||||
|
||||
# 生成pe文件的cfg列表
|
||||
# 生成CFG
|
||||
generate_cfg(binary_name, cfg_path)
|
||||
|
||||
# 生成GDL
|
||||
generate_gdl(gdl_path)
|
||||
|
||||
# 生成ASM
|
||||
generate_asm(asm_path)
|
||||
|
||||
# 关闭IDA Pro
|
||||
idc.Exit(0)
|
||||
|
||||
def generate_cfg(binary_name, cfg_path):
    """Collect the function CFGs of the loaded binary and pickle them to ``cfg_path``.

    :param binary_name: name of the analysed binary; accepted for interface
        compatibility, not used inside this function.
    :param cfg_path: destination path of the pickled CFG container (``.ida``).
    """
    # NOTE(review): get_func_cfgs_c and FirstSeg are not defined in this file;
    # presumably they come from the project's `func` module and IDA's `idc`
    # namespace -- confirm they are in scope.
    cfgs = get_func_cfgs_c(FirstSeg())
    # Pickle data is binary: write it exactly once, in 'wb' mode, and let the
    # context manager close the handle (no leaked text-mode file object).
    with open(cfg_path, 'wb') as cfg_file:
        pickle.dump(cfgs, cfg_file)
|
||||
|
||||
# 生成pe文件的fcg,保存为.dot文件
|
||||
# idc.GenCallGdl(gdl_path, 'Call Gdl', idc.CHART_GEN_GDL) 这个生成gdl文件,网上几乎找不到gdl这个格式
|
||||
def generate_gdl(gdl_path):
    """Export the call graph of the loaded binary to ``gdl_path`` as a DOT chart.

    :param gdl_path: destination path handed to ``idc.GenCallGdl``.
    """
    chart_title = 'Call Gdl'
    chart_format = idaapi.CHART_GEN_DOT  # DOT output rather than GDL
    idc.GenCallGdl(gdl_path, chart_title, chart_format)
|
||||
|
||||
# 生成.asm文件
|
||||
def generate_asm(asm_path):
    """Write an assembly listing of the loaded binary to ``asm_path``.

    :param asm_path: destination path handed to ``idc.GenerateFile``.
    """
    output_kind = idc.OFILE_ASM
    # The range arguments 0 .. BADADDR are passed through unchanged;
    # presumably this selects the whole address space -- confirm against IDA docs.
    idc.GenerateFile(output_kind, asm_path, 0, idc.BADADDR, 0)
|
||||
|
||||
# 关闭IDA Pro
|
||||
idc.Exit(0)
|
||||
# 主函数
|
||||
def main():
    """Script entry point: read the input file name and workflow argument, then run ``preprocess``.

    If no workflow argument was supplied after the script name, a message is
    printed and nothing is processed.
    """
    input_name = idc.GetInputFile()
    try:
        workflow_arg = idc.ARGV[1]
    except IndexError:
        # No argument after the script path: report and do nothing.
        print("Workflow argument not provided.")
    else:
        preprocess(input_name, workflow_arg)
|
||||
|
||||
|
||||
# 通用命令行格式 idaq64 -c -A -S"preprocessing_ida.py arg1 arg2" VirusShare_bca58b12923073
|
||||
# 此处使用 idaq64 -c -A -S"preprocessing_ida.py workflow" -oF:\iout pe_path,完整命令行如下
|
||||
# F:\kkk\IDA_6.6\idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oF:\iout D:\hkn\infected\datasets\virusshare_infected0\VirusShare_bc161e5e792028e8137aa070fda53f82
|
||||
# D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out D:\bishe\dataset\train_malware\0ACDbR5M3ZhBJajygTuf
|
||||
if __name__ == '__main__':
|
||||
preprocess()
|
||||
# 如果是作为IDA Pro的脚本运行,调用主函数
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
Loading…
Reference in New Issue
Block a user