From 65d25d42de1cfc7066a05e6ee821a018ca13c309 Mon Sep 17 00:00:00 2001
From: huihun <781165206@qq.com>
Date: Fri, 1 Mar 2024 16:42:02 +0800
Subject: [PATCH] py2.7 working version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../convert_pkl_to_json.py | 216 +++++++++---------
 1 file changed, 114 insertions(+), 102 deletions(-)

diff --git a/Genius3/raw-feature-extractor/convert_pkl_to_json.py b/Genius3/raw-feature-extractor/convert_pkl_to_json.py
index 837483b..f7b3a52 100644
--- a/Genius3/raw-feature-extractor/convert_pkl_to_json.py
+++ b/Genius3/raw-feature-extractor/convert_pkl_to_json.py
@@ -1,4 +1,5 @@
 # coding=utf-8
+import hashlib
 import pickle as pk
 import re
 import json
@@ -6,125 +7,133 @@ import os
 from tqdm import tqdm
 
-def convert(start, end, overhaul):
-    for workflow in range(start, end):
-        # workflow = 0
-        cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
-        output_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow)
-        dot_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot".format(workflow)
+def calc_sha256(file_path):
+    with open(file_path, 'rb') as f:
+        bytes = f.read()
+        sha256obj = hashlib.sha256(bytes)
+        sha256 = sha256obj.hexdigest()
+        return sha256
 
-        log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow)
-        process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)
-        if overhaul:
-            if os.path.exists(log_path):
-                os.remove(log_path)
-            if os.path.exists(process_log_path):
-                os.remove(process_log_path)
+def convert_malware(overhaul):
+    cfg_dir = "D:\\bishe\\dataset\\infected\\infected_cfg"
+    output_dir = "D:\\bishe\\dataset\\infected\\infected_jsonl"
+    dot_dir = "D:\\bishe\\dataset\\infected\\infected_dot"
+    raw_dir = "D:\\bishe\\dataset\\train_malware"
 
-        with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
-            logged = log.readline()
-            if logged == '':
-                log_index = 0
+    log_path = "D:\\bishe\\dataset\\logging\\convert_malware_log.log"
+    process_log_path = "D:\\bishe\\dataset\\logging\\convert_malware_process_log.log"
+
+    if overhaul:
+        if os.path.exists(log_path):
+            os.remove(log_path)
+        if os.path.exists(process_log_path):
+            os.remove(process_log_path)
+
+    with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
+        logged = log.readline()
+        if logged == '':
+            log_index = 0
+        else:
+            log_index = int(logged)
+
+        for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))):
+            if index < log_index:
+                continue
+
+            name = cfg[:-4]  # file name only, without the extension
+            cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
+            try:
+                data = pk.load(cfg_file)
+            except EOFError:
+                process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg))
+                continue
+            except ValueError:
+                process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
+                continue
+            finally:
+                cfg_file.close()
+
+            dot_file_path = os.path.join(dot_dir, name + '.dot')
+            if not os.path.exists(dot_file_path):
+                process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg))
             else:
-                log_index = int(logged)
+                # open the .dot file to get the FCG
+                raw_function_edges = []
+                # 2023.8.12 bug fix: the FCG (.dot) file generated by IDA contains all functions, while data.raw_graph_list only contains internal functions
+                functions_list = []
+                with open(dot_file_path, 'r') as dot:
+                    for line in dot:
+                        if '->' in line:
+                            raw_function_edges.append(re.findall(r'\b\d+\b', line))
+                        elif 'label' in line:
+                            functions_list.append(line[line.find('= "') + 3:line.find('",')])
 
-            for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))):
-                if index < log_index:
+                # no internal function was detected, which normally should not happen; to be safe, discard this sample
+                if raw_function_edges.__len__() == 0:
                     continue
 
-                name = cfg[:-4]  # file name only, without the extension
-                cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
-                try:
-                    data = pk.load(cfg_file)
-                except EOFError:
-                    process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg))
-                    continue
-                except ValueError:
-                    process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
-                    continue
-                finally:
-                    cfg_file.close()
+                # create the json object for the current PE file
+                json_obj = {
+                    'hash': calc_sha256(raw_dir + "\\" + name),
+                    # 2023.8.12 bug fix: this only counted the internal functions
+                    # 'function_number': data.raw_graph_list.__len__(),
+                    'function_number': len(functions_list),
+                    'function_edges': [[int(d[0]) for d in raw_function_edges],
+                                       [int(d[1]) for d in raw_function_edges]],
+                    'acfg_list': [],
+                    'function_names': functions_list
+                }
 
-                dot_file_path = os.path.join(dot_dir, name + '.dot')
-                if not os.path.exists(dot_file_path):
-                    process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg))
-                else:
-                    # open the .dot file to get the FCG
-                    raw_function_edges = []
-                    # 2023.8.12 bug fix: the FCG (.dot) file generated by IDA contains all functions, while data.raw_graph_list only contains internal functions
-                    functions_list = []
-                    with open(dot_file_path, 'r') as dot:
-                        for line in dot:
-                            if '->' in line:
-                                raw_function_edges.append(re.findall(r'\b\d+\b', line))
-                            elif 'label' in line:
-                                functions_list.append(line[line.find('= "') + 3:line.find('",')])
-
-                    # no internal function was detected, which normally should not happen; to be safe, discard this sample
-                    if raw_function_edges.__len__() == 0:
+                # 2023.8.12 bug fix: data.raw_graph_list holds the internal functions detected by IDA and excludes external functions, so the function list and function count cannot be taken from here
+                # read the pkl file; each acfg is derived from one function
+                for acfg in data.raw_graph_list:
+                    # the function is an external one, no CFG needs to be built for it
+                    if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname:
                         continue
 
-                    # create the json object for the current PE file
-                    json_obj = {
-                        'hash': data.binary_name[11:],
-                        # 2023.8.12 bug fix: this only counted the internal functions
-                        # 'function_number': data.raw_graph_list.__len__(),
-                        'function_number': len(functions_list),
-                        'function_edges': [[int(d[0]) for d in raw_function_edges],
-                                           [int(d[1]) for d in raw_function_edges]],
-                        'acfg_list': [],
-                        'function_names': functions_list
+                    # index 2 is used because the Genius framework stores the offspring count at position 2 when extracting features
+                    offspring = [d.get('v')[2] for d in acfg.g.node.values()]
+                    # for unknown reasons the two arrays may end up with different lengths, although they should be identical
+                    # take the framework as the reference and trim bb_features to the same length as g.node
+                    diff = acfg.g.__len__() - len(acfg.bb_features)
+                    if diff != 0:
+                        del acfg.bb_features[diff:]
+                    # append the offspring-count feature to bb_features
+
+                    for i, offs in enumerate(offspring):
+                        acfg.bb_features[i].append(offs)
+
+                    acfg_item = {
+                        'block_number': acfg.g.__len__(),
+                        'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
+                        'block_features': acfg.bb_features
                     }
 
-                    # 2023.8.12 bug fix: data.raw_graph_list holds the internal functions detected by IDA and excludes external functions, so the function list and function count cannot be taken from here
-                    # read the pkl file; each acfg is derived from one function
-                    for acfg in data.raw_graph_list:
-                        # the function is an external one, no CFG needs to be built for it
-                        if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname:
-                            continue
+                    json_obj['acfg_list'].append(acfg_item)
+                    # json_obj['function_names'].append(acfg.funcname)
 
-                        # index 2 is used because the Genius framework stores the offspring count at position 2 when extracting features
-                        offspring = [d.get('v')[2] for d in acfg.g.node.values()]
-                        # for unknown reasons the two arrays may end up with different lengths, although they should be identical
-                        # take the framework as the reference and trim bb_features to the same length as g.node
-                        diff = acfg.g.__len__() - len(acfg.bb_features)
-                        if diff != 0:
-                            del acfg.bb_features[diff:]
-                        # append the offspring-count feature to bb_features
+                # write the result to a local json file
+                result = json.dumps(json_obj, ensure_ascii=False)
 
-                        for i, offs in enumerate(offspring):
-                            acfg.bb_features[i].append(offs)
+                with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
+                    out.write(result)
 
-                        acfg_item = {
-                            'block_number': acfg.g.__len__(),
-                            'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
-                            'block_features': acfg.bb_features
-                        }
-
-                        json_obj['acfg_list'].append(acfg_item)
-                        # json_obj['function_names'].append(acfg.funcname)
-
-                    # write the result to a local json file
-                    result = json.dumps(json_obj, ensure_ascii=False)
-
-                    with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
-                        out.write(result)
-
-                log.truncate(0)
-                log.seek(0)
-                log.write(str(index))
-                log.flush()
-                process_log.write("index {}, {} process done.\n".format(index, cfg))
+            log.truncate(0)
+            log.seek(0)
+            log.write(str(index))
+            log.flush()
+            process_log.write("index {}, {} process done.\n".format(index, cfg))
 
 def convert_benign(overhaul):
-    cfg_dir = "F:\\kkk\\dataset\\benign\\refind_cfg"
-    dot_dir = "F:\\kkk\\dataset\\benign\\refind_dot"
-    output_dir = "F:\\kkk\\dataset\\benign\\refind_jsonl"
+    cfg_dir = "D:\\bishe\\dataset\\benign\\refind_cfg"
+    dot_dir = "D:\\bishe\\dataset\\benign\\refind_dot"
+    output_dir = "D:\\bishe\\dataset\\benign\\refind_jsonl"
+    raw_dir = "D:\\bishe\\dataset\\train_benign"
 
-    log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_log.log"
-    process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_process_log{}.log"
+    log_path = "D:\\bishe\\dataset\\logging\\convert_benign_log.log"
+    process_log_path = "D:\\bishe\\dataset\\logging\\convert_benign_process_log.log"
 
     if overhaul:
         if os.path.exists(log_path):
@@ -145,6 +154,7 @@ def convert_benign(overhaul):
             continue
 
         name = cfg[:-4]  # file name only
+
         cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
         try:
             data = pk.load(cfg_file)
@@ -180,7 +190,7 @@ def convert_benign(overhaul):
 
             # create the json object for the current PE file
             json_obj = {
-                'hash': data.binary_name[11:],
+                'hash': calc_sha256(raw_dir + "\\" + name),
                 # 2023.8.12 bug fix: this only counted the internal functions
                 # 'function_number': data.raw_graph_list.__len__(),
                 'function_number': len(functions_list),
@@ -233,4 +243,6 @@ def convert_benign(overhaul):
 
 if __name__ == '__main__':
     # convert(35, 69)
-    convert_benign(False)
+    # convert_benign(True)
+    convert_benign(True)
+    convert_malware(True)
\ No newline at end of file
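
For reference, a record written by this patch can be read back as follows. This is a minimal sketch and not part of the patch itself; the path and file name are hypothetical and stand for any .jsonl file produced by convert_malware() or convert_benign().

# Sketch: load one converted record and summarise its structure (Python 2.7 compatible).
import json

record_path = "D:\\bishe\\dataset\\infected\\infected_jsonl\\example.jsonl"  # hypothetical sample
with open(record_path, 'r') as f:
    obj = json.loads(f.read())  # each .jsonl file holds a single JSON object

# 'function_edges' stores the call graph as two parallel lists: edge sources and targets.
callers, callees = obj['function_edges']
assert len(callers) == len(callees)

print("sha256:         {}".format(obj['hash']))
print("functions:      {}".format(obj['function_number']))
print("call edges:     {}".format(len(callers)))
print("internal acfgs: {}".format(len(obj['acfg_list'])))
# each entry of 'acfg_list' describes one internal function: block count, block edges, block features
for i, acfg in enumerate(obj['acfg_list'][:3]):
    print("acfg {}: {} blocks, {} feature vectors".format(
        i, acfg['block_number'], len(acfg['block_features'])))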