From 601a61157b3686ddf8ae06eb4a36e16c4f6a112c Mon Sep 17 00:00:00 2001
From: huihun <781165206@qq.com>
Date: Fri, 5 Jan 2024 14:30:45 +0800
Subject: [PATCH] =?UTF-8?q?=E6=B5=8B=E8=AF=95=E6=94=B9=E5=8A=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 samples/PreProcess.py            |  5 ++---
 samples/funCount.py              | 20 +++++++++++++++-----
 src/utils/PreProcessedDataset.py |  4 ++--
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/samples/PreProcess.py b/samples/PreProcess.py
index b23d29d..9970f53 100644
--- a/samples/PreProcess.py
+++ b/samples/PreProcess.py
@@ -30,12 +30,11 @@ def parse_json_list_2_pyg_object(jsonl_file: str, label: int, vocab: Vocab):
             
             external_function_index_list = [vocab[f_name] for f_name in external_function_name_list]
             index += 1
-            torch.save(Data(hash=item_hash, local_acfgs=acfg_list, external_list=external_function_index_list, function_edges=item_function_edges, targets=label), "./{}.pt".format(index))
-            print(index)
+            torch.save(Data(hash=item_hash, local_acfgs=acfg_list, external_list=external_function_index_list, function_edges=item_function_edges, targets=label), "./cache/benign_{}.pt".format(index))
 
 
 if __name__ == '__main__':
-    json_path = "./sample.jsonl"
+    json_path = "./benign_result.jsonl"
     train_vocab_file = "../data/processed_dataset/train_external_function_name_vocab.jsonl"
     # train_vocab_file = "./res.jsonl"
     max_vocab_size = 10000
diff --git a/samples/funCount.py b/samples/funCount.py
index 165e675..42a1bdb 100644
--- a/samples/funCount.py
+++ b/samples/funCount.py
@@ -3,9 +3,19 @@ import json
 from tqdm import tqdm
 
 if __name__ == '__main__':
-    file_name = './sample.jsonl'
-    fil = open(file_name, mode='r')
+    mal_file_name = './malware_result.jsonl'  # first pass: malware samples
+    ben_file_name = './benign_result.jsonl'  # second pass: benign samples (same name PreProcess.py reads)
+    fil = open(mal_file_name, mode='r')
     fun_name_dict = {}
+    for item in tqdm(fil):  # each line of the file is parsed as one JSON object
+        item = json.loads(item)
+        item_fun_list = item['function_names']
+        for fun_name in item_fun_list:
+            if fun_name_dict.get(fun_name) is not None:
+                fun_name_dict[fun_name] += 1
+            else:
+                fun_name_dict[fun_name] = 1
+    fil = open(ben_file_name, mode='r')  # fixed: was mal_file_name, which counted malware twice and skipped benign
     for item in tqdm(fil):
         item = json.loads(item)
         item_fun_list = item['function_names']
@@ -15,7 +25,7 @@ if __name__ == '__main__':
             else:
                 fun_name_dict[fun_name] = 1
 
-    with open('./res.jsonl','w') as file:
-        for key,value in fun_name_dict.items():
-            temp = {"f_name":key, "count":value}
+    with open('./res.jsonl', 'w') as file:
+        for key, value in fun_name_dict.items():
+            temp = {"f_name": key, "count": value}
             file.write(json.dumps(temp) + '\n')
diff --git a/src/utils/PreProcessedDataset.py b/src/utils/PreProcessedDataset.py
index dba7991..2797891 100644
--- a/src/utils/PreProcessedDataset.py
+++ b/src/utils/PreProcessedDataset.py
@@ -4,7 +4,7 @@ from datetime import datetime
 
 import torch
 from torch_geometric.data import Dataset, DataLoader
-from utils.RealBatch import create_real_batch_data  # noqa
+from RealBatch import create_real_batch_data  # noqa
 
 
 class MalwareDetectionDataset(Dataset):
@@ -66,7 +66,7 @@ def _simulating(_dataset, _batch_size: int):
 
 
 if __name__ == '__main__':
-    root_path: str = '/home/xiang/MalGraph/data/processed_dataset/DatasetJSON/'
+    root_path: str = '/home/king/python/MalGraph-main/data/processed_dataset/DatasetJSON'
     i_batch_size = 2
     
     train_dataset = MalwareDetectionDataset(root=root_path, train_or_test='train')