diff --git a/.idea/deployment.xml b/.idea/deployment.xml
index 81b14c5..2ef2c9d 100644
--- a/.idea/deployment.xml
+++ b/.idea/deployment.xml
@@ -2,7 +2,14 @@
-
+
+
+
+
+
+
+
+
diff --git a/Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py b/Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py
new file mode 100644
index 0000000..b28ad47
--- /dev/null
+++ b/Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py
@@ -0,0 +1,87 @@
+import torch
+import torch.nn as nn
+import torch_geometric
+from torch_geometric.data import Batch, Data
+
+
+class HierarchicalGraphNeuralNetwork(nn.Module):
+    def __init__(self, external_vocab: 'Vocab'):  # Vocab is a project-local class, referenced by name only
+        super(HierarchicalGraphNeuralNetwork, self).__init__()
+        self.pool = 'global_max_pool'
+        # Hierarchy 1: Control Flow Graph (CFG) embedding and pooling
+        cfg_filter_list = [200, 200]
+        cfg_filter_list.insert(0, 11)
+        self.cfg_filter_length = len(cfg_filter_list)
+        cfg_graphsage_params = [dict(in_channels=cfg_filter_list[i], out_channels=cfg_filter_list[i + 1], bias=True)
+                                for i in range(self.cfg_filter_length - 1)]
+        cfg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=cfg_graphsage_params)
+        cfg_constructor = cfg_conv['constructor']
+        for i in range(self.cfg_filter_length - 1):
+            setattr(self, 'CFG_gnn_{}'.format(i + 1), cfg_constructor(**cfg_conv['kwargs'][i]))
+        self.dropout = nn.Dropout(p=0.2)
+        # Hierarchy 2: Function Call Graph (FCG) embedding and pooling
+        self.external_embedding_layer = nn.Embedding(num_embeddings=external_vocab.max_vocab_size + 2,
+                                                     embedding_dim=cfg_filter_list[-1],
+                                                     padding_idx=external_vocab.pad_idx)
+        fcg_filter_list = [200, 200]
+        fcg_filter_list.insert(0, cfg_filter_list[-1])
+        self.fcg_filter_length = len(fcg_filter_list)
+        fcg_graphsage_params = [dict(in_channels=fcg_filter_list[i], out_channels=fcg_filter_list[i + 1], bias=True)
+                                for i in range(self.fcg_filter_length - 1)]
+        fcg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=fcg_graphsage_params)
+        fcg_constructor = fcg_conv['constructor']
+        for i in range(self.fcg_filter_length - 1):
+            setattr(self, 'FCG_gnn_{}'.format(i + 1), fcg_constructor(**fcg_conv['kwargs'][i]))
+        # final projection: step down to the class logits with successive linear layers
+        self.pj1 = torch.nn.Linear(in_features=fcg_filter_list[-1], out_features=int(fcg_filter_list[-1] / 2))
+        self.pj2 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 2), out_features=int(fcg_filter_list[-1] / 4))
+        self.pj3 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 4), out_features=6)
+        self.last_activation = nn.Softmax(dim=1)
+
+    def forward(self, real_local_batch: Batch, real_bt_positions: list, bt_external_names: list,
+                bt_all_function_edges: list):
+        rtn_local_batch = self.forward_cfg_gnn(local_batch=real_local_batch)
+        x_cfg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_local_batch.x, batch=rtn_local_batch.batch)
+        fcg_list = []
+        fcg_internal_list = []
+        for idx_batch in range(len(real_bt_positions) - 1):
+            start_pos, end_pos = real_bt_positions[idx_batch: idx_batch + 2]
+            idx_x_cfg = x_cfg_pool[start_pos: end_pos]
+            fcg_internal_list.append(idx_x_cfg)
+            idx_x_external = self.external_embedding_layer(
+                torch.tensor([bt_external_names[idx_batch]], dtype=torch.long))
+            idx_x_external = idx_x_external.squeeze(dim=0)
+            idx_x_total = torch.cat([idx_x_cfg, idx_x_external], dim=0)
+            idx_function_edge = torch.tensor(bt_all_function_edges[idx_batch], dtype=torch.long)
+            idx_graph_data = Data(x=idx_x_total, edge_index=idx_function_edge)
+            idx_graph_data.validate()
+            fcg_list.append(idx_graph_data)
+        fcg_batch = Batch.from_data_list(fcg_list)
+        # Hierarchy 2: Function Call Graph (FCG) embedding and pooling
+        rtn_fcg_batch = self.forward_fcg_gnn(function_batch=fcg_batch)  # [batch_size, max_node_size, dim]
+        x_fcg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_fcg_batch.x, batch=rtn_fcg_batch.batch)
+        batch_final = x_fcg_pool
+        # last step: project down to the number of classes (multiclass)
+        bt_final_embed = self.pj3(self.pj2(self.pj1(batch_final)))
+        bt_pred = self.last_activation(bt_final_embed)
+        return bt_pred
+
+    def forward_cfg_gnn(self, local_batch: Batch):
+        in_x, edge_index = local_batch.x, local_batch.edge_index
+        for i in range(self.cfg_filter_length - 1):
+            out_x = getattr(self, 'CFG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
+            out_x = torch.nn.functional.relu(out_x, inplace=True)
+            out_x = self.dropout(out_x)
+            in_x = out_x
+        local_batch.x = in_x
+        return local_batch
+
+    def forward_fcg_gnn(self, function_batch: Batch):
+        in_x, edge_index = function_batch.x, function_batch.edge_index
+        for i in range(self.fcg_filter_length - 1):
+            out_x = getattr(self, 'FCG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
+            out_x = torch.nn.functional.relu(out_x, inplace=True)
+            out_x = self.dropout(out_x)
+            in_x = out_x
+        function_batch.x = in_x
+        return function_batch
\ No newline at end of file
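Review note: a minimal smoke-test sketch of how this module is driven. The dummy vocab, graph sizes, and edge lists below are hypothetical, and it assumes the PyG version pinned by this repo (one where torch_geometric.nn.glob still exists, as the module itself relies on):

    import torch
    from torch_geometric.data import Batch, Data

    from HierarchicalGraphModel_mine import HierarchicalGraphNeuralNetwork


    class DummyVocab(object):
        # only the two attributes the model actually reads
        max_vocab_size = 100
        pad_idx = 0


    model = HierarchicalGraphNeuralNetwork(external_vocab=DummyVocab())

    # one binary with two internal functions of 3 and 2 basic blocks (11 features per block)
    cfg_a = Data(x=torch.randn(3, 11), edge_index=torch.tensor([[0, 1], [1, 2]]))
    cfg_b = Data(x=torch.randn(2, 11), edge_index=torch.tensor([[0], [1]]))

    pred = model(
        real_local_batch=Batch.from_data_list([cfg_a, cfg_b]),
        real_bt_positions=[0, 2],                        # CFG-pool rows 0..2 belong to this one sample
        bt_external_names=[[1, 2]],                      # vocab ids of two external functions (nodes 2 and 3)
        bt_all_function_edges=[[[0, 1, 2], [2, 3, 1]]],  # FCG edges in the two-row [2, E] layout
    )
    print(pred.shape)  # torch.Size([1, 6]), a softmax over the 6 classes
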
diff --git a/Genius3/raw-feature-extractor/convert_pkl_to_json.py b/Genius3/raw-feature-extractor/convert_pkl_to_json.py
index 81afb45..56bae4c 100644
--- a/Genius3/raw-feature-extractor/convert_pkl_to_json.py
+++ b/Genius3/raw-feature-extractor/convert_pkl_to_json.py
@@ -6,7 +6,7 @@ import os
 from tqdm import tqdm
 
 
-def convert(start, end):
+def convert(start, end, overhaul):
     for workflow in range(start, end):
         # workflow = 0
         cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
@@ -16,6 +16,12 @@
         log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow)
         process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)
 
+        if overhaul:
+            if os.path.exists(log_path):
+                os.remove(log_path)
+            if os.path.exists(process_log_path):
+                os.remove(process_log_path)
+
         with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
             logged = log.readline()
             if logged == '':
@@ -112,5 +118,116 @@
             process_log.write("index {}, {} process done.\n".format(index, cfg))
 
 
+def convert_benign(overhaul):
+    cfg_dir = "D:\\hkn\\infected\\datasets\\benign_cfg\\new"
+    output_dir = "D:\\hkn\\infected\\datasets\\benign_json\\new"
+    dot_dir = "D:\\hkn\\infected\\datasets\\benign_dot\\new"
+
+    log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_log.log"
+    process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_process_log.log"
+
+    if overhaul:
+        if os.path.exists(log_path):
+            os.remove(log_path)
+        if os.path.exists(process_log_path):
+            os.remove(process_log_path)
+
+    with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
+        logged = log.readline()
+        if logged == '':
+            log_index = 0
+        else:
+            log_index = int(logged)
+
+        for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))):
+            if index < log_index:
+                continue
+
+            name = cfg[:-4]  # bare file name without the extension
+            cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
+            try:
+                data = pk.load(cfg_file)
+            except EOFError:
+                process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg))
+                continue
+            except ValueError:
+                process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
+                continue
+            finally:
+                cfg_file.close()
+
+            dot_file_path = os.path.join(dot_dir, name + '.dot')
+            if not os.path.exists(dot_file_path):
+                process_log.write("index {}, {} process failed. dot file does not exist.\n".format(index, cfg))
+            else:
+                # open the .dot file to recover the FCG
+                raw_function_edges = []
+                # 2023.8.12 bug fix: the FCG (.dot) generated by IDA contains all functions,
+                # while data.raw_graph_list contains only the internal ones
+                functions_list = []
+                with open(dot_file_path, 'r') as dot:
+                    for line in dot:
+                        if '->' in line:
+                            raw_function_edges.append(re.findall(r'\b\d+\b', line))
+                        elif 'label' in line:
+                            functions_list.append(line[line.find('= "') + 3:line.find('",')])
+
+                # no internal function detected; this should not normally happen,
+                # so drop the sample to be safe
+                if len(raw_function_edges) == 0:
+                    continue
+
+                # build the JSON object for the current PE file
+                json_obj = {
+                    'hash': data.binary_name[11:],  # strips the 11-char 'VirusShare_' prefix
+                    # 2023.8.12 bug fix: this counted internal functions only
+                    # 'function_number': len(data.raw_graph_list),
+                    'function_number': len(functions_list),
+                    'function_edges': [[int(d[0]) for d in raw_function_edges],
+                                       [int(d[1]) for d in raw_function_edges]],
+                    'acfg_list': [],
+                    'function_names': functions_list
+                }
+
+                # 2023.8.12 bug fix: data.raw_graph_list holds the internal functions detected by IDA,
+                # external functions excluded, so neither the function list nor the count may come from it.
+                # Read the pkl file; each ACFG is decomposed from one function.
+                for acfg in data.raw_graph_list:
+                    # skip external functions; no CFG needs to be built for them
+                    if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname:
+                        continue
+
+                    # index 2 because the Genius framework stores the offspring count at position 2
+                    offspring = [d.get('v')[2] for d in acfg.g.node.values()]
+                    # for unknown reasons the two arrays can differ in length, although they should match;
+                    # trust the framework and trim bb_features down to the length of g.node
+                    diff = len(acfg.g) - len(acfg.bb_features)
+                    if diff < 0:
+                        del acfg.bb_features[diff:]
+
+                    # append the offspring count to each block's feature vector
+                    for i, offs in enumerate(offspring):
+                        acfg.bb_features[i].append(offs)
+
+                    acfg_item = {
+                        'block_number': len(acfg.g),
+                        'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
+                        'block_features': acfg.bb_features
+                    }
+
+                    json_obj['acfg_list'].append(acfg_item)
+                    # json_obj['function_names'].append(acfg.funcname)
+
+                # write the result to a local JSON file
+                result = json.dumps(json_obj, ensure_ascii=False)
+
+                with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
+                    out.write(result)
+
+            log.truncate(0)
+            log.seek(0)
+            log.write(str(index))
+            log.flush()
+            process_log.write("index {}, {} process done.\n".format(index, cfg))
+
+
 if __name__ == '__main__':
-    convert(35, 69)
+    # convert(35, 69)
+    convert_benign(True)
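Review note: for orientation, one emitted .jsonl record has the shape below, shown as the Python dict that json.dumps receives (all values are illustrative, not from a real sample):

    {
        'hash': '0a1b2c3d',                    # binary_name with the 11-char prefix stripped
        'function_number': 3,                  # every function named in the .dot FCG, internal and external
        'function_edges': [[0, 0, 1],          # caller function ids
                           [1, 2, 2]],         # callee function ids
        'acfg_list': [                         # one entry per internal function only
            {
                'block_number': 2,
                'block_edges': [[0], [1]],     # basic-block edges, same two-row layout
                'block_features': [[0] * 11,   # per-block features, offspring count appended last
                                   [0] * 11]
            }
        ],
        'function_names': ['start', 'sub_401000', 'CreateFileA']
    }
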
-S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py -1" -oF:\iout {}'.format( + os.path.join(benign_pe_dir, pe)) + + p = multiprocessing.Process(target=call_preprocess, args=[cmd_line]) + p.start() + flag_kill = True + start = time.time() + while time.time() - start <= TIMEOUT: + if not p.is_alive(): + flag_kill = False + break + else: + time.sleep(1) + + if flag_kill: + subprocess.call('taskkill /im idaq64.exe /f') + process_log.write( + "index {}, {} stuck, process terminated.\n".format(index, pe)) + + total_failed += 1 + else: + # 正常运行结束 + log.truncate(0) + log.seek(0) + log.write(str(index)) + log.flush() + process_log.write("index {}, {} process done.\n".format(index, pe)) + # 所有副产物删除 + delete_output() + + print('总失败数{}'.format(total_failed)) + + +def mal_batch_mode(start, end): # 只选其中这些类的pe进行分析,其他的就直接跳过 families_need_to_analyze = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0, 'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0} + # 记录ida处理报错的数据来自哪些家族 + failed_family = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0, + 'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0} + # 总失败数据数量 + total_failed = 0 + for workflow in range(start, end): # pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_test' pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow) @@ -73,6 +138,9 @@ def batch_mode(start, end): subprocess.call('taskkill /im idaq64.exe /f') process_log.write( "index {}, {} in workflow {} stuck, process terminated.\n".format(index, pe, workflow)) + + failed_family[pe_family] += 1 + total_failed += 1 else: # 正常运行结束 log.truncate(0) @@ -85,6 +153,10 @@ def batch_mode(start, end): # 一次workflow结束后将所有副产物删除 delete_output() + print(families_need_to_analyze) + print('\n') + print(failed_family, '总失败数{}'.format(total_failed)) + def delete_output(): out_dir = 'F:\\iout' @@ -96,4 +168,5 @@ def delete_output(): # 注意:该py文件必须放在IDA的根目录下,且必须使用cmd命令执行,否则无法链接到python库 # F:\\kkk\\IDA_6.6 if __name__ == '__main__': - batch_mode(36, 69) + benign_batch_mode(True) + # mal_batch_mode(35, 69) diff --git a/Genius3/raw-feature-extractor/preprocessing_ida.py b/Genius3/raw-feature-extractor/preprocessing_ida.py index fd24ec7..507d83c 100644 --- a/Genius3/raw-feature-extractor/preprocessing_ida.py +++ b/Genius3/raw-feature-extractor/preprocessing_ida.py @@ -1,11 +1,8 @@ # -*- coding: UTF-8 -*- import pickle from func import * -from raw_graphs import * from idc import * -import idautils import os -import sys def preprocess(): @@ -18,9 +15,13 @@ def preprocess(): binary_name = idc.GetInputFile() workflow = idc.ARGV[1] - # workflow = 0 - cfg_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow) - gdl_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot\\{}.dot".format(workflow, binary_name) + # workflow为特定值时分析良性软件,否则分析恶意软件 + if workflow == '-1': + cfg_path = "D:\\hkn\\infected\\datasets\\benign_cfg\\new" + gdl_path = "D:\\hkn\\infected\\datasets\\benign_dot\\new\\{}.dot".format(binary_name) + else: + cfg_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow) + gdl_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot\\{}.dot".format(workflow, binary_name) analysis_flags = idc.GetShortPrm(idc.INF_START_AF) analysis_flags &= ~idc.AF_IMMOFF diff --git a/Genius3/raw-feature-extractor/test.py b/Genius3/raw-feature-extractor/test.py index 6e3c460..722739c 100644 --- a/Genius3/raw-feature-extractor/test.py +++ 
diff --git a/Genius3/raw-feature-extractor/preprocessing_ida.py b/Genius3/raw-feature-extractor/preprocessing_ida.py
index fd24ec7..507d83c 100644
--- a/Genius3/raw-feature-extractor/preprocessing_ida.py
+++ b/Genius3/raw-feature-extractor/preprocessing_ida.py
@@ -1,11 +1,8 @@
 # -*- coding: UTF-8 -*-
 import pickle
 from func import *
-from raw_graphs import *
 from idc import *
-import idautils
 import os
-import sys
 
 
 def preprocess():
@@ -18,9 +15,13 @@
     binary_name = idc.GetInputFile()
     workflow = idc.ARGV[1]
-    # workflow = 0
-    cfg_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
-    gdl_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot\\{}.dot".format(workflow, binary_name)
+    # a sentinel workflow value selects benign-software analysis; any other value means malware
+    if workflow == '-1':
+        cfg_path = "D:\\hkn\\infected\\datasets\\benign_cfg\\new"
+        gdl_path = "D:\\hkn\\infected\\datasets\\benign_dot\\new\\{}.dot".format(binary_name)
+    else:
+        cfg_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
+        gdl_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot\\{}.dot".format(workflow, binary_name)
 
     analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
     analysis_flags &= ~idc.AF_IMMOFF
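Review note: the '-1' sentinel reaches this script through IDAPython's script arguments, assuming the usual IDA convention that idc.ARGV[0] holds the script path:

    # with the batch switch -S"...preprocessing_ida.py -1", inside IDA:
    #   idc.ARGV == ['...preprocessing_ida.py', '-1']   (path shortened here for illustration)
    #   so idc.ARGV[1] == '-1' routes output to the benign_cfg / benign_dot directories above
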
diff --git a/Genius3/raw-feature-extractor/test.py b/Genius3/raw-feature-extractor/test.py
index 6e3c460..722739c 100644
--- a/Genius3/raw-feature-extractor/test.py
+++ b/Genius3/raw-feature-extractor/test.py
@@ -46,7 +46,6 @@
 def create(parent_dir, folder):
     os.mkdir(os.path.join(parent_dir, folder))
 
 
-
 def change_max_item_lines():
     f = open("F:\\kkk\\IDA_6.6\\cfg\\ida.cfg", 'rb')
     s = f.read()
@@ -89,6 +88,7 @@
 def check_json():
+    print('start checking json')
     for workflow in tqdm(range(0, 69)):
         json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
         for json_file in os.listdir(json_dir):
@@ -99,9 +99,13 @@
                 continue
             finally:
                 f.close()
-            for acfg in data['acfg_list']:
-                if acfg['block_number'] != len(acfg['block_features']):
-                    print("{} {}\n".format(workflow, json_file))
+
+            if len(data['function_edges'][0]) == 0:
+                print("{} {} function_edges null\n".format(workflow, json_file))
+            # continue
+            # for acfg in data['acfg_list']:
+            #     if acfg['block_number'] != len(acfg['block_features']):
+            #         print("{} {}\n".format(workflow, json_file))
 
 
 # temporary helper: delete all .jsonl files
@@ -112,21 +116,44 @@
         for f in os.listdir(json_dir):
             os.remove(os.path.join(json_dir, f))
 
 
-# temporary helper: rename the .pt files so they match what the code expects
-def rename():
+def delete_all_local():
+    src = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
+    dirs = ['train_malware', 'test_malware', 'valid_malware', 'train_benign', 'test_benign', 'valid_benign',
+            'train_malware_backup', 'test_malware_backup', 'valid_malware_backup']
+    for d in dirs:
+        path = os.path.join(src, d)
+        for f in os.listdir(path):
+            os.remove(os.path.join(path, f))
+
+
+# rename the .pt files so they match what the code expects
+def rename(mal_or_be, postfix):
     tag_set = ['train', 'test', 'valid']
     for tag in tag_set:
-        data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_malware/'.format(tag)
+        data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_{}{}/'.format(tag, mal_or_be, postfix)
         for index, f in enumerate(os.listdir(data_dir)):
            os.rename(os.path.join(data_dir, f), os.path.join(data_dir, 'm' + f))
 
     for tag in tag_set:
-        data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_malware/'.format(tag)
+        data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_{}{}/'.format(tag, mal_or_be, postfix)
         for index, f in enumerate(os.listdir(data_dir)):
-            os.rename(os.path.join(data_dir, f), os.path.join(data_dir, 'malware_{}.pt'.format(index)))
+            os.rename(os.path.join(data_dir, f), os.path.join(data_dir, '{}_{}.pt'.format(mal_or_be, index)))
 
 
-def split_samples():
-    path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all'
+def split_samples(flag):
+    postfix = ''
+    if flag == 'one_family':
+        path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\one_family_malware'
+        tag = 'malware'
+    elif flag == 'standard':
+        path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all'
+        postfix = '_backup'
+        tag = 'malware'
+    elif flag == 'benign':
+        path = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all_benign'
+        tag = 'benign'
+    else:
+        return
+
     out = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'
     os_list = os.listdir(path)
     random.shuffle(os_list)
@@ -135,11 +162,12 @@
     test_len = int(train_len / 8)
     for index, f in enumerate(os_list):
         if index < train_len:
-            shutil.copy(os.path.join(path, f), os.path.join(out, 'train_malware'))
+            shutil.copy(os.path.join(path, f), os.path.join(out, 'train_{}'.format(tag) + postfix))
         elif train_len <= index < train_len + test_len:
-            shutil.copy(os.path.join(path, f), os.path.join(out, 'test_malware'))
+            shutil.copy(os.path.join(path, f), os.path.join(out, 'test_{}'.format(tag) + postfix))
         else:
-            shutil.copy(os.path.join(path, f), os.path.join(out, 'valid_malware'))
+            shutil.copy(os.path.join(path, f), os.path.join(out, 'valid_{}'.format(tag) + postfix))
+    rename(tag, postfix)
 
 
 def half_divide():
@@ -206,6 +234,19 @@
         os.remove(os.path.join(pe_dir, name))
 
 
+def delete_pe():
+    dot_dir = 'D:\\hkn\\infected\\datasets\\benign_dot'
+    cfg_dir = 'D:\\hkn\\infected\\datasets\\benign_cfg'
+    dot_list = os.listdir(dot_dir)
+    for cfg in os.listdir(cfg_dir):
+        name = cfg[:-4] + ".dot"
+        if name in dot_list:
+            continue
+        else:
+            print(os.path.join(dot_dir, name))
+            # os.remove(os.path.join(dot_dir, cfg))
+
+
 if __name__ == '__main__':
     # create_dir()
     # change_max_item_lines()
@@ -213,11 +254,22 @@
     # delete_error()
     # test()
     # delete_jsonl()
+    delete_all_local()
     # check_json()
-    split_samples()
-    # rename()
+    # delete_pe()
+
+    # rename('malware', '_backup')
+
+    # pass 'standard', 'benign' or 'one_family':
+    # 'standard' processes all malware samples
+    # split_samples('standard')
+    # 'one_family' processes a single family only, used to test the original model's binary classification
+    # split_samples('one_family')
+    # 'benign' processes the benign samples
+    # split_samples('benign')
+
     # half_divide()
     # copy_train_data()
     # clear_dot()
     # read_test()
-    # del_redundant()
+    # del_redundant()
\ No newline at end of file
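Review note: a sketch of the proprecessed_pt layout that split_samples('benign') plus the rename('benign', '') it now triggers should leave behind ('proprecessed' is the repo's own spelling; file names illustrative):

    D:\hkn\infected\datasets\proprecessed_pt\
        all_benign\       source .pt files, copied but left untouched
        train_benign\     benign_0.pt, benign_1.pt, ...   (first train_len files of the shuffled list)
        test_benign\      benign_0.pt, ...                (the next train_len / 8 files)
        valid_benign\     benign_0.pt, ...                (the remainder)
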