diff --git a/Genius3/raw-feature-extractor/convert_pkl_to_json.py b/Genius3/raw-feature-extractor/convert_pkl_to_json.py index 837483b..7807f52 100644 --- a/Genius3/raw-feature-extractor/convert_pkl_to_json.py +++ b/Genius3/raw-feature-extractor/convert_pkl_to_json.py @@ -1,4 +1,5 @@ # coding=utf-8 +import hashlib import pickle as pk import re import json @@ -6,125 +7,133 @@ import os from tqdm import tqdm -def convert(start, end, overhaul): - for workflow in range(start, end): - # workflow = 0 - cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow) - output_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow) - dot_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot".format(workflow) +def calc_sha256(file_path): + with open(file_path, 'rb') as f: + bytes = f.read() + sha256obj = hashlib.sha256(bytes) + sha256 = sha256obj.hexdigest() + return sha256 - log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow) - process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow) - if overhaul: - if os.path.exists(log_path): - os.remove(log_path) - if os.path.exists(process_log_path): - os.remove(process_log_path) +def convert_malware(overhaul): + cfg_dir = "D:\\bishe\\dataset\\infected\\infected_cfg" + output_dir = "D:\\bishe\\dataset\\infected\\infected_jsonl" + dot_dir = "D:\\bishe\\dataset\\infected\\infected_dot" + raw_dir = "D:\\bishe\\dataset\\train_malware" - with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log: - logged = log.readline() - if logged == '': - log_index = 0 + log_path = "D:\\bishe\\dataset\\logging\\convert_malware_log.log" + process_log_path = "D:\\bishe\\dataset\\logging\\convert_malware_process_log.log" + + if overhaul: + if os.path.exists(log_path): + os.remove(log_path) + if os.path.exists(process_log_path): + os.remove(process_log_path) + + with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log: + logged = log.readline() + if logged == '': + log_index = 0 + else: + log_index = int(logged) + + for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))): + if index < log_index: + continue + + name = cfg[:-4] # 纯文件名,不带后缀 + cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r') + try: + data = pk.load(cfg_file) + except EOFError: + process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg)) + continue + except ValueError: + process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg)) + continue + finally: + cfg_file.close() + + dot_file_path = os.path.join(dot_dir, name + '.dot') + if not os.path.exists(dot_file_path): + process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg)) else: - log_index = int(logged) + # 打开dot文件获取fcg + raw_function_edges = [] + # 2023.8.12 bug fix: ida生成的fcg(.dot)文件包含了所有函数,data.raw_graph_list仅包含了内部函数 + functions_list = [] + with open(dot_file_path, 'r') as dot: + for line in dot: + if '->' in line: + raw_function_edges.append(re.findall(r'\b\d+\b', line)) + elif 'label' in line: + functions_list.append(line[line.find('= "') + 3:line.find('",')]) - for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))): - if index < log_index: + # 没有内部函数被检测到,正常来说不应该,保险起见还是不要这数据了 + if raw_function_edges.__len__() == 0: continue - name = cfg[:-4] # 纯文件名,不带后缀 - cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r') - try: - data = pk.load(cfg_file) - except EOFError: - process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg)) - continue - except ValueError: - process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg)) - continue - finally: - cfg_file.close() + # 为当前pe文件创建json对象 + json_obj = { + 'hash': calc_sha256(raw_dir + "\\" + name), + # 2023.8.12 bug fix: 这里获取的是内部函数的数量 + # 'function_number': data.raw_graph_list.__len__(), + 'function_number': len(functions_list), + 'function_edges': [[int(d[0]) for d in raw_function_edges], + [int(d[1]) for d in raw_function_edges]], + 'acfg_list': [], + 'function_names': functions_list + } - dot_file_path = os.path.join(dot_dir, name + '.dot') - if not os.path.exists(dot_file_path): - process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg)) - else: - # 打开dot文件获取fcg - raw_function_edges = [] - # 2023.8.12 bug fix: ida生成的fcg(.dot)文件包含了所有函数,data.raw_graph_list仅包含了内部函数 - functions_list = [] - with open(dot_file_path, 'r') as dot: - for line in dot: - if '->' in line: - raw_function_edges.append(re.findall(r'\b\d+\b', line)) - elif 'label' in line: - functions_list.append(line[line.find('= "') + 3:line.find('",')]) - - # 没有内部函数被检测到,正常来说不应该,保险起见还是不要这数据了 - if raw_function_edges.__len__() == 0: + # 2023.8.12 bug fix: data.raw_graph_list是ida检测到的内部函数,不包括外部函数,因此函数列表和函数数量不能从这里获取 + # 读取pkl文件,一个acfg由一个函数分解而来 + for acfg in data.raw_graph_list: + # 函数为外部函数,不需要构建cfg + if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname: continue - # 为当前pe文件创建json对象 - json_obj = { - 'hash': data.binary_name[11:], - # 2023.8.12 bug fix: 这里获取的是内部函数的数量 - # 'function_number': data.raw_graph_list.__len__(), - 'function_number': len(functions_list), - 'function_edges': [[int(d[0]) for d in raw_function_edges], - [int(d[1]) for d in raw_function_edges]], - 'acfg_list': [], - 'function_names': functions_list + # 这里2是因为Genius框架提取特征时将后代数量放在2 + offspring = [d.get('v')[2] for d in acfg.g.node.values()] + # 这边可能会出现不知名的原因两个数组长度不一致,按理来说应该是一致的 + # 以框架为主,将bb_features数组削减为和g.node长度一致 + diff = acfg.g.__len__() - len(acfg.bb_features) + if diff != 0: + del acfg.bb_features[diff:] + # 将后代数量的特征放入bb_features中 + + for i, offs in enumerate(offspring): + acfg.bb_features[i].append(offs) + + acfg_item = { + 'block_number': acfg.g.__len__(), + 'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]], + 'block_features': acfg.bb_features } - # 2023.8.12 bug fix: data.raw_graph_list是ida检测到的内部函数,不包括外部函数,因此函数列表和函数数量不能从这里获取 - # 读取pkl文件,一个acfg由一个函数分解而来 - for acfg in data.raw_graph_list: - # 函数为外部函数,不需要构建cfg - if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname: - continue + json_obj['acfg_list'].append(acfg_item) + # json_obj['function_names'].append(acfg.funcname) - # 这里2是因为Genius框架提取特征时将后代数量放在2 - offspring = [d.get('v')[2] for d in acfg.g.node.values()] - # 这边可能会出现不知名的原因两个数组长度不一致,按理来说应该是一致的 - # 以框架为主,将bb_features数组削减为和g.node长度一致 - diff = acfg.g.__len__() - len(acfg.bb_features) - if diff != 0: - del acfg.bb_features[diff:] - # 将后代数量的特征放入bb_features中 + # 将结果写入json本地文件 + result = json.dumps(json_obj, ensure_ascii=False) - for i, offs in enumerate(offspring): - acfg.bb_features[i].append(offs) + with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out: + out.write(result) - acfg_item = { - 'block_number': acfg.g.__len__(), - 'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]], - 'block_features': acfg.bb_features - } - - json_obj['acfg_list'].append(acfg_item) - # json_obj['function_names'].append(acfg.funcname) - - # 将结果写入json本地文件 - result = json.dumps(json_obj, ensure_ascii=False) - - with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out: - out.write(result) - - log.truncate(0) - log.seek(0) - log.write(str(index)) - log.flush() - process_log.write("index {}, {} process done.\n".format(index, cfg)) + log.truncate(0) + log.seek(0) + log.write(str(index)) + log.flush() + process_log.write("index {}, {} process done.\n".format(index, cfg)) def convert_benign(overhaul): - cfg_dir = "F:\\kkk\\dataset\\benign\\refind_cfg" - dot_dir = "F:\\kkk\\dataset\\benign\\refind_dot" - output_dir = "F:\\kkk\\dataset\\benign\\refind_jsonl" + cfg_dir = "D:\\bishe\\dataset\\benign\\refind_cfg" + dot_dir = "D:\\bishe\\dataset\\benign\\refind_dot" + output_dir = "D:\\bishe\\dataset\\benign\\refind_jsonl" + raw_dir = "D:\\bishe\\dataset\\train_benign" - log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_log.log" - process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_process_log{}.log" + log_path = "D:\\bishe\\dataset\\logging\\convert_benign_log.log" + process_log_path = "D:\\bishe\\dataset\\logging\\convert_benign_process_log.log" if overhaul: if os.path.exists(log_path): @@ -145,6 +154,7 @@ def convert_benign(overhaul): continue name = cfg[:-4] # 纯文件名 + cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r') try: data = pk.load(cfg_file) @@ -180,7 +190,7 @@ def convert_benign(overhaul): # 为当前pe文件创建json对象 json_obj = { - 'hash': data.binary_name[11:], + 'hash': calc_sha256(raw_dir + "\\" + name), # 2023.8.12 bug fix: 这里获取的是内部函数的数量 # 'function_number': data.raw_graph_list.__len__(), 'function_number': len(functions_list), @@ -233,4 +243,6 @@ def convert_benign(overhaul): if __name__ == '__main__': # convert(35, 69) - convert_benign(False) + # convert_benign(True) + convert_benign(True) + convert_malware(True) diff --git a/Genius3/raw-feature-extractor/preprocessing_ida.py b/Genius3/raw-feature-extractor/preprocessing_ida.py index 14afd45..4744c07 100644 --- a/Genius3/raw-feature-extractor/preprocessing_ida.py +++ b/Genius3/raw-feature-extractor/preprocessing_ida.py @@ -1,54 +1,73 @@ -# -*- coding: UTF-8 -*- -import pickle -from func import * -from idc import * +# coding=utf-8 import os +import pickle +import idc +import idaapi +# 定义常量 +DATA_DIR = "D:\\bishe\\dataset" +INFECTED_DIR = os.path.join(DATA_DIR, "infected") +BENIGN_DIR = os.path.join(DATA_DIR, "benign") +CFG_EXTENSION = ".ida" +GDL_EXTENSION = ".dot" +ASM_EXTENSION = ".asm" -def preprocess(): - # E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter - # print str(sys.argv) #['raw-feature-extractor/preprocessing_ida.py'] - # print str(idc.ARGV) #['raw-feature-extractor/preprocessing_ida.py', '--path', 'C:\\Program1\\pycharmproject\\Genius3\\acfgs'] - # print idc.ARGV[2] - # print type(idc.ARGV[2]) +def preprocess(binary_name, workflow): + cfg_path = os.path.join( + INFECTED_DIR if workflow != "-1" else BENIGN_DIR, + f"{binary_name}{CFG_EXTENSION}" + ) + gdl_path = os.path.join( + INFECTED_DIR if workflow != "-1" else BENIGN_DIR, + f"{binary_name}{GDL_EXTENSION}" + ) + asm_path = os.path.join( + INFECTED_DIR if workflow != "-1" else BENIGN_DIR, + f"{binary_name}{ASM_EXTENSION}" + ) - binary_name = idc.GetInputFile() - - workflow = idc.ARGV[1] - # workflow为特定值时分析良性软件,否则分析恶意软件 - if workflow == '-1': - cfg_path = "D:\\bishe\\dataset\\benign\\refind_cfg\\{}.ida".format(binary_name) - gdl_path = "D:\\bishe\\dataset\\benign\\refind_dot\\{}.dot".format(binary_name) - asm_path = "D:\\bishe\\dataset\\benign\\refind_asm\\{}.asm".format(binary_name) + if os.path.exists(cfg_path): + idc.Exit(0) else: - cfg_path = "D:\\bishe\\dataset\\infected\\infected_cfg\\{}.ida".format(binary_name) - gdl_path = "D:\\bishe\\dataset\\infected\\infected_dot\\{}.dot".format(binary_name) - asm_path = "D:\\bishe\\dataset\\infected\\infected_asm\\{}.asm".format(binary_name) + analysis_flags = idc.GetShortPrm(idc.INF_START_AF) + analysis_flags &= ~idc.AF_IMMOFF + idc.SetShortPrm(idc.INF_START_AF, analysis_flags) - analysis_flags = idc.GetShortPrm(idc.INF_START_AF) - analysis_flags &= ~idc.AF_IMMOFF - idc.SetShortPrm(idc.INF_START_AF, analysis_flags) - idaapi.autoWait() + idaapi.autoWait() - # 生成pe文件的cfg列表 + # 生成CFG + generate_cfg(binary_name, cfg_path) + + # 生成GDL + generate_gdl(gdl_path) + + # 生成ASM + generate_asm(asm_path) + + # 关闭IDA Pro + idc.Exit(0) + +def generate_cfg(binary_name, cfg_path): cfgs = get_func_cfgs_c(FirstSeg()) - # 将cfg保存为.ida - pickle.dump(cfgs, open(cfg_path, 'w')) + with open(cfg_path, 'wb') as cfg_file: + pickle.dump(cfgs, cfg_file) - # 生成pe文件的fcg,保存为.dot文件 - # idc.GenCallGdl(gdl_path, 'Call Gdl', idc.CHART_GEN_GDL) 这个生成gdl文件,网上几乎找不到gdl这个格式 +def generate_gdl(gdl_path): idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT) - # 生成.asm文件 +def generate_asm(asm_path): idc.GenerateFile(idc.OFILE_ASM, asm_path, 0, idc.BADADDR, 0) - # 关闭IDA Pro - idc.Exit(0) +# 主函数 +def main(): + binary_name = idc.GetInputFile() + try: + workflow = idc.ARGV[1] + except IndexError: + print("Workflow argument not provided.") + return + preprocess(binary_name, workflow) - -# 通用命令行格式 idaq64 -c -A -S"preprocessing_ida.py arg1 arg2" VirusShare_bca58b12923073 -# 此处使用 idaq64 -c -A -S"preprocessing_ida.py workflow" -oF:\iout pe_path,完整命令行如下 -# F:\kkk\IDA_6.6\idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oF:\iout D:\hkn\infected\datasets\virusshare_infected0\VirusShare_bc161e5e792028e8137aa070fda53f82 -# D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out D:\bishe\dataset\train_malware\0ACDbR5M3ZhBJajygTuf -if __name__ == '__main__': - preprocess() +# 如果是作为IDA Pro的脚本运行,调用主函数 +if __name__ == "__main__": + main()