批量化操作

This commit is contained in:
huihun 2024-03-01 14:45:10 +08:00
parent 8063d079db
commit 0f1e3378a2
2 changed files with 173 additions and 142 deletions

View File

@ -1,4 +1,5 @@
# coding=utf-8
import hashlib
import pickle as pk
import re
import json
@ -6,15 +7,22 @@ import os
from tqdm import tqdm
def convert(start, end, overhaul):
for workflow in range(start, end):
# workflow = 0
cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
output_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow)
dot_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot".format(workflow)
def calc_sha256(file_path, chunk_size=1024 * 1024):
    """Return the hex-encoded SHA-256 digest of the file at *file_path*.

    The file is hashed in fixed-size chunks so that large samples are not
    loaded into memory in one piece.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration (default 1 MiB).

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops at the empty read that marks EOF.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow)
process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)
def convert_malware(overhaul):
cfg_dir = "D:\\bishe\\dataset\\infected\\infected_cfg"
output_dir = "D:\\bishe\\dataset\\infected\\infected_jsonl"
dot_dir = "D:\\bishe\\dataset\\infected\\infected_dot"
raw_dir = "D:\\bishe\\dataset\\train_malware"
log_path = "D:\\bishe\\dataset\\logging\\convert_malware_log.log"
process_log_path = "D:\\bishe\\dataset\\logging\\convert_malware_process_log.log"
if overhaul:
if os.path.exists(log_path):
@ -67,7 +75,7 @@ def convert(start, end, overhaul):
# 为当前pe文件创建json对象
json_obj = {
'hash': data.binary_name[11:],
'hash': calc_sha256(raw_dir + "\\" + name),
# 2023.8.12 bug fix: 这里获取的是内部函数的数量
# 'function_number': data.raw_graph_list.__len__(),
'function_number': len(functions_list),
@ -119,12 +127,13 @@ def convert(start, end, overhaul):
def convert_benign(overhaul):
cfg_dir = "F:\\kkk\\dataset\\benign\\refind_cfg"
dot_dir = "F:\\kkk\\dataset\\benign\\refind_dot"
output_dir = "F:\\kkk\\dataset\\benign\\refind_jsonl"
cfg_dir = "D:\\bishe\\dataset\\benign\\refind_cfg"
dot_dir = "D:\\bishe\\dataset\\benign\\refind_dot"
output_dir = "D:\\bishe\\dataset\\benign\\refind_jsonl"
raw_dir = "D:\\bishe\\dataset\\train_benign"
log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_log.log"
process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_process_log{}.log"
log_path = "D:\\bishe\\dataset\\logging\\convert_benign_log.log"
process_log_path = "D:\\bishe\\dataset\\logging\\convert_benign_process_log.log"
if overhaul:
if os.path.exists(log_path):
@ -145,6 +154,7 @@ def convert_benign(overhaul):
continue
name = cfg[:-4] # 纯文件名
cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
try:
data = pk.load(cfg_file)
@ -180,7 +190,7 @@ def convert_benign(overhaul):
# 为当前pe文件创建json对象
json_obj = {
'hash': data.binary_name[11:],
'hash': calc_sha256(raw_dir + "\\" + name),
# 2023.8.12 bug fix: 这里获取的是内部函数的数量
# 'function_number': data.raw_graph_list.__len__(),
'function_number': len(functions_list),
@ -233,4 +243,6 @@ def convert_benign(overhaul):
if __name__ == '__main__':
# convert(35, 69)
convert_benign(False)
# convert_benign(True)
convert_benign(True)
convert_malware(True)

View File

@ -1,54 +1,73 @@
# -*- coding: UTF-8 -*-
import pickle
from func import *
from idc import *
# coding=utf-8
import os
import pickle
import idc
import idaapi
# 定义常量
DATA_DIR = "D:\\bishe\\dataset"
INFECTED_DIR = os.path.join(DATA_DIR, "infected")
BENIGN_DIR = os.path.join(DATA_DIR, "benign")
CFG_EXTENSION = ".ida"
GDL_EXTENSION = ".dot"
ASM_EXTENSION = ".asm"
def preprocess():
# E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
# print str(sys.argv) #['raw-feature-extractor/preprocessing_ida.py']
# print str(idc.ARGV) #['raw-feature-extractor/preprocessing_ida.py', '--path', 'C:\\Program1\\pycharmproject\\Genius3\\acfgs']
# print idc.ARGV[2]
# print type(idc.ARGV[2])
def preprocess(binary_name, workflow):
    """Extract the CFG, call graph and assembly listing of the loaded binary.

    Output locations follow the dataset layout used by the previous version
    of this script: benign samples go to ``refind_cfg`` / ``refind_dot`` /
    ``refind_asm`` under BENIGN_DIR, every other workflow goes to
    ``infected_cfg`` / ``infected_dot`` / ``infected_asm`` under INFECTED_DIR.

    Args:
        binary_name: Name of the input file, used as the output base name.
        workflow: Workflow argument passed on the IDA command line. The
            string "-1" selects the benign dataset; any other value selects
            the malware (infected) dataset.
    """
    is_benign = workflow == "-1"
    base_dir = BENIGN_DIR if is_benign else INFECTED_DIR
    prefix = "refind" if is_benign else "infected"

    # Plain concatenation rather than f-strings: the command line documented
    # for this script launches IDA Pro 6.8, whose IDAPython runs on Python 2.
    cfg_path = os.path.join(base_dir, prefix + "_cfg", binary_name + CFG_EXTENSION)
    gdl_path = os.path.join(base_dir, prefix + "_dot", binary_name + GDL_EXTENSION)
    asm_path = os.path.join(base_dir, prefix + "_asm", binary_name + ASM_EXTENSION)

    # Benign samples that already have a CFG file are skipped, as before.
    if is_benign and os.path.exists(cfg_path):
        idc.Exit(0)
        return

    # Clear AF_IMMOFF in the start-up analysis flags, then wait for
    # auto-analysis to finish before extracting anything.
    analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
    analysis_flags &= ~idc.AF_IMMOFF
    idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
    idaapi.autoWait()

    # Pickle the CFG list of the PE file to the .ida file.
    generate_cfg(binary_name, cfg_path)

    # Save the function call graph of the PE file as a .dot file.
    generate_gdl(gdl_path)

    # Write the .asm listing.
    generate_asm(asm_path)

    # Close IDA Pro.
    idc.Exit(0)
def generate_cfg(binary_name, cfg_path):
    """Build the CFG list of the loaded binary and pickle it to *cfg_path*.

    Args:
        binary_name: Name of the input binary. Not used here; kept so
            existing callers keep working.
        cfg_path: Destination file for the pickled CFG list.
    """
    # Imported locally and called through the ``idc`` module so this function
    # does not depend on ``from func import *`` / ``from idc import *`` being
    # present at the top of the file.
    from func import get_func_cfgs_c

    cfgs = get_func_cfgs_c(idc.FirstSeg())
    with open(cfg_path, 'wb') as cfg_file:
        pickle.dump(cfgs, cfg_file)
# 通用命令行格式 idaq64 -c -A -S"preprocessing_ida.py arg1 arg2" VirusShare_bca58b12923073
# 此处使用 idaq64 -c -A -S"preprocessing_ida.py workflow" -oF:\iout pe_path完整命令行如下
# F:\kkk\IDA_6.6\idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oF:\iout D:\hkn\infected\datasets\virusshare_infected0\VirusShare_bc161e5e792028e8137aa070fda53f82
# D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out D:\bishe\dataset\train_malware\0ACDbR5M3ZhBJajygTuf
if __name__ == '__main__':
preprocess()
def generate_gdl(gdl_path):
    """Write the call graph of the loaded binary to *gdl_path* as a DOT chart."""
    chart_title = 'Call Gdl'
    chart_format = idaapi.CHART_GEN_DOT
    idc.GenCallGdl(gdl_path, chart_title, chart_format)
def generate_asm(asm_path):
    """Write an assembly listing of the loaded binary to *asm_path*."""
    # Range 0 .. BADADDR with flags 0, exactly as in the original call.
    first_ea, last_ea, flags = 0, idc.BADADDR, 0
    idc.GenerateFile(idc.OFILE_ASM, asm_path, first_ea, last_ea, flags)
# Entry point
def main():
    """Read the input file name and workflow argument, then run preprocess().

    Prints a message and returns without doing anything when no workflow
    argument was supplied in idc.ARGV.
    """
    binary_name = idc.GetInputFile()

    # ARGV[0] is the script itself; the workflow is the first real argument.
    if len(idc.ARGV) < 2:
        print("Workflow argument not provided.")
        return

    preprocess(binary_name, idc.ARGV[1])
# When executed as an IDA Pro script, invoke the main function.
if __name__ == "__main__":
    main()