# coding=utf-8 import hashlib import pickle as pk import re import json import os from tqdm import tqdm def calc_sha256(file_path): with open(file_path, 'rb') as f: bytes = f.read() sha256obj = hashlib.sha256(bytes) sha256 = sha256obj.hexdigest() return sha256 def convert_malware(overhaul): cfg_dir = "D:\\bishe\\dataset\\infected\\infected_cfg" output_dir = "D:\\bishe\\dataset\\infected\\infected_jsonl" dot_dir = "D:\\bishe\\dataset\\infected\\infected_dot" raw_dir = "D:\\bishe\\dataset\\train_malware" log_path = "D:\\bishe\\dataset\\logging\\convert_malware_log.log" process_log_path = "D:\\bishe\\dataset\\logging\\convert_malware_process_log.log" if overhaul: if os.path.exists(log_path): os.remove(log_path) if os.path.exists(process_log_path): os.remove(process_log_path) with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log: logged = log.readline() if logged == '': log_index = 0 else: log_index = int(logged) for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))): if index < log_index: continue name = cfg[:-4] # 纯文件名,不带后缀 cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r') try: data = pk.load(cfg_file) except EOFError: process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg)) continue except ValueError: process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg)) continue finally: cfg_file.close() dot_file_path = os.path.join(dot_dir, name + '.dot') if not os.path.exists(dot_file_path): process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg)) else: # 打开dot文件获取fcg raw_function_edges = [] # 2023.8.12 bug fix: ida生成的fcg(.dot)文件包含了所有函数,data.raw_graph_list仅包含了内部函数 functions_list = [] with open(dot_file_path, 'r') as dot: for line in dot: if '->' in line: raw_function_edges.append(re.findall(r'\b\d+\b', line)) elif 'label' in line: functions_list.append(line[line.find('= "') + 3:line.find('",')]) # 没有内部函数被检测到,正常来说不应该,保险起见还是不要这数据了 if raw_function_edges.__len__() == 0: continue # 为当前pe文件创建json对象 json_obj = { 'hash': calc_sha256(raw_dir + "\\" + name), # 2023.8.12 bug fix: 这里获取的是内部函数的数量 # 'function_number': data.raw_graph_list.__len__(), 'function_number': len(functions_list), 'function_edges': [[int(d[0]) for d in raw_function_edges], [int(d[1]) for d in raw_function_edges]], 'acfg_list': [], 'function_names': functions_list } # 2023.8.12 bug fix: data.raw_graph_list是ida检测到的内部函数,不包括外部函数,因此函数列表和函数数量不能从这里获取 # 读取pkl文件,一个acfg由一个函数分解而来 for acfg in data.raw_graph_list: # 函数为外部函数,不需要构建cfg if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname: continue # 这里2是因为Genius框架提取特征时将后代数量放在2 offspring = [d.get('v')[2] for d in acfg.g.node.values()] # 这边可能会出现不知名的原因两个数组长度不一致,按理来说应该是一致的 # 以框架为主,将bb_features数组削减为和g.node长度一致 diff = acfg.g.__len__() - len(acfg.bb_features) if diff != 0: del acfg.bb_features[diff:] # 将后代数量的特征放入bb_features中 for i, offs in enumerate(offspring): acfg.bb_features[i].append(offs) acfg_item = { 'block_number': acfg.g.__len__(), 'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]], 'block_features': acfg.bb_features } json_obj['acfg_list'].append(acfg_item) # json_obj['function_names'].append(acfg.funcname) # 将结果写入json本地文件 result = json.dumps(json_obj, ensure_ascii=False) with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out: out.write(result) log.truncate(0) log.seek(0) log.write(str(index)) log.flush() process_log.write("index {}, {} process done.\n".format(index, cfg)) def convert_benign(overhaul): cfg_dir = "D:\\bishe\\dataset\\benign\\refind_cfg" dot_dir = "D:\\bishe\\dataset\\benign\\refind_dot" output_dir = "D:\\bishe\\dataset\\benign\\refind_jsonl" raw_dir = "D:\\bishe\\dataset\\train_benign" log_path = "D:\\bishe\\dataset\\logging\\convert_benign_log.log" process_log_path = "D:\\bishe\\dataset\\logging\\convert_benign_process_log.log" if overhaul: if os.path.exists(log_path): os.remove(log_path) if os.path.exists(process_log_path): os.remove(process_log_path) with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log: logged = log.readline() if logged == '': log_index = 0 else: log_index = int(logged) cdg_list = os.listdir(cfg_dir) for index, cfg in enumerate(tqdm(cdg_list)): if index < log_index: continue name = cfg[:-4] # 纯文件名 cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r') try: data = pk.load(cfg_file) except EOFError: process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg)) continue except ValueError: process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg)) continue except KeyError: process_log.write("index {}, {} process failed. KeyError occurred.\n".format(index, cfg)) finally: cfg_file.close() dot_file_path = os.path.join(dot_dir, name + '.dot') if not os.path.exists(dot_file_path): process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg)) else: # 打开dot文件获取fcg raw_function_edges = [] # 2023.8.12 bug fix: ida生成的fcg(.dot)文件包含了所有函数,data.raw_graph_list仅包含了内部函数 functions_list = [] with open(dot_file_path, 'r') as dot: for line in dot: if '->' in line: raw_function_edges.append(re.findall(r'\b\d+\b', line)) elif 'label' in line: functions_list.append(line[line.find('= "') + 3:line.find('",')]) # 没有内部函数被检测到,正常来说不应该,保险起见还是不要这数据了 if raw_function_edges.__len__() == 0: continue # 为当前pe文件创建json对象 json_obj = { 'hash': calc_sha256(raw_dir + "\\" + name), # 2023.8.12 bug fix: 这里获取的是内部函数的数量 # 'function_number': data.raw_graph_list.__len__(), 'function_number': len(functions_list), 'function_edges': [[int(d[0]) for d in raw_function_edges], [int(d[1]) for d in raw_function_edges]], 'acfg_list': [], 'function_names': functions_list } # 2023.8.12 bug fix: data.raw_graph_list是ida检测到的内部函数,不包括外部函数,因此函数列表和函数数量不能从这里获取 # 读取pkl文件,一个acfg由一个函数分解而来 for acfg in data.raw_graph_list: # 函数为外部函数,不需要构建cfg if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname: continue # 这里2是因为Genius框架提取特征时将后代数量放在2 offspring = [d.get('v')[2] for d in acfg.g.node.values()] # 这边可能会出现不知名的原因两个数组长度不一致,按理来说应该是一致的 # 以框架为主,将bb_features数组削减为和g.node长度一致 diff = acfg.g.__len__() - len(acfg.bb_features) if diff != 0: del acfg.bb_features[diff:] # 将后代数量的特征放入bb_features中 for i, offs in enumerate(offspring): acfg.bb_features[i].append(offs) acfg_item = { 'block_number': acfg.g.__len__(), 'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]], 'block_features': acfg.bb_features } json_obj['acfg_list'].append(acfg_item) # json_obj['function_names'].append(acfg.funcname) # 将结果写入json本地文件 result = json.dumps(json_obj, ensure_ascii=False) with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out: out.write(result) log.truncate(0) log.seek(0) log.write(str(index)) log.flush() process_log.write("index {}, {} process done.\n".format(index, cfg)) if __name__ == '__main__': # convert(35, 69) # convert_benign(True) convert_benign(True) convert_malware(True)