diff --git a/samples/PreProcess.py b/samples/PreProcess.py index b23d29d..9970f53 100644 --- a/samples/PreProcess.py +++ b/samples/PreProcess.py @@ -30,12 +30,11 @@ def parse_json_list_2_pyg_object(jsonl_file: str, label: int, vocab: Vocab): external_function_index_list = [vocab[f_name] for f_name in external_function_name_list] index += 1 - torch.save(Data(hash=item_hash, local_acfgs=acfg_list, external_list=external_function_index_list, function_edges=item_function_edges, targets=label), "./{}.pt".format(index)) - print(index) + torch.save(Data(hash=item_hash, local_acfgs=acfg_list, external_list=external_function_index_list, function_edges=item_function_edges, targets=label), "./cache/benign_{}.pt".format(index)) if __name__ == '__main__': - json_path = "./sample.jsonl" + json_path = "./benign_result.jsonl" train_vocab_file = "../data/processed_dataset/train_external_function_name_vocab.jsonl" # train_vocab_file = "./res.jsonl" max_vocab_size = 10000 diff --git a/samples/funCount.py b/samples/funCount.py index 165e675..42a1bdb 100644 --- a/samples/funCount.py +++ b/samples/funCount.py @@ -3,9 +3,19 @@ import json from tqdm import tqdm if __name__ == '__main__': - file_name = './sample.jsonl' - fil = open(file_name, mode='r') + mal_file_name = './malware_result.jsonl' + ben_file_name = './benign-result.jsonl' + fil = open(mal_file_name, mode='r') fun_name_dict = {} + for item in tqdm(fil): + item = json.loads(item) + item_fun_list = item['function_names'] + for fun_name in item_fun_list: + if fun_name_dict.get(fun_name) is not None: + fun_name_dict[fun_name] += 1 + else: + fun_name_dict[fun_name] = 1 + fil = open(mal_file_name, mode='r') for item in tqdm(fil): item = json.loads(item) item_fun_list = item['function_names'] @@ -15,7 +25,7 @@ if __name__ == '__main__': else: fun_name_dict[fun_name] = 1 - with open('./res.jsonl','w') as file: - for key,value in fun_name_dict.items(): - temp = {"f_name":key, "count":value} + with open('./res.jsonl', 'w') as file: + for key, value in fun_name_dict.items(): + temp = {"f_name": key, "count": value} file.write(json.dumps(temp) + '\n') diff --git a/src/utils/PreProcessedDataset.py b/src/utils/PreProcessedDataset.py index dba7991..2797891 100644 --- a/src/utils/PreProcessedDataset.py +++ b/src/utils/PreProcessedDataset.py @@ -4,7 +4,7 @@ from datetime import datetime import torch from torch_geometric.data import Dataset, DataLoader -from utils.RealBatch import create_real_batch_data # noqa +from RealBatch import create_real_batch_data # noqa class MalwareDetectionDataset(Dataset): @@ -66,7 +66,7 @@ def _simulating(_dataset, _batch_size: int): if __name__ == '__main__': - root_path: str = '/home/xiang/MalGraph/data/processed_dataset/DatasetJSON/' + root_path: str = '/home/king/python/MalGraph-main/data/processed_dataset/DatasetJSON' i_batch_size = 2 train_dataset = MalwareDetectionDataset(root=root_path, train_or_test='train')