From facd48a0b7bea98c631eff55329fb3913ce48305 Mon Sep 17 00:00:00 2001
From: huihun <781165206@qq.com>
Date: Wed, 1 May 2024 13:09:47 +0800
Subject: [PATCH] Backup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 configs/default.yaml  |  6 +++---
 samples/PreProcess.py | 11 ++++++++---
 src/DistTrainModel.py | 19 ++++++++++++++-----
 3 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/configs/default.yaml b/configs/default.yaml
index a1d320e..e39082c 100644
--- a/configs/default.yaml
+++ b/configs/default.yaml
@@ -3,19 +3,19 @@ Data:
   train_vocab_file: "/home/king/python/data/fun_name_sort.jsonl"
   max_vocab_size: 10000 # modify according to the result of 1BuildExternalVocab.py
 Training:
-  cuda: False # enable GPU training if cuda is available
+  cuda: True # enable GPU training if cuda is available
   dist_backend: "nccl" # if using torch.distribution, the backend to be used
   dist_port: "1234"
   max_epoches: 10
   train_batch_size: 16
-  test_batch_size: 32
+  test_batch_size: 8
   seed: 19920208
   only_test_path: 'None'
 Model:
   ablation_models: "Full" # "Full"
   gnn_type: "GraphSAGE" # "GraphSAGE" / "GCN"
   pool_type: "global_max_pool" # "global_max_pool" / "global_mean_pool"
-  acfg_node_init_dims: 11
+  acfg_node_init_dims: 32
   cfg_filters: "200-200"
   fcg_filters: "200-200"
   number_classes: 1
diff --git a/samples/PreProcess.py b/samples/PreProcess.py
index 24fae78..5ca58a3 100644
--- a/samples/PreProcess.py
+++ b/samples/PreProcess.py
@@ -39,7 +39,12 @@ def json_to_pt(file: str, label: int, vocab: Vocab, save_path: str, file_type: s
         os.makedirs(save_path+f"{train_type}_{file_type}/")
     with open(file, "r", encoding="utf-8") as item:
         line = item.readline()
-        item = json.loads(line)
+        try:
+            item = json.loads(line)
+        except json.decoder.JSONDecodeError as e:
+            print(e)
+            print(file)
+            return False
         item_hash = item['hash']
     acfg_list = []
     for one_acfg in item['acfg_list']:  # list of dict of acfg
@@ -74,7 +79,7 @@ if __name__ == '__main__':
     file_type = ["malware", "benign"]
     max_vocab_size = 10000
     vocabulary = Vocab(freq_file=train_vocab_file, max_vocab_size=max_vocab_size)
-    parse_json_list_2_pyg_object(jsonl_file=malware_json_path, label=1, vocab=vocabulary, save_path=save_vocab_file,
-                                 file_type=file_type[0])
+    # parse_json_list_2_pyg_object(jsonl_file=malware_json_path, label=1, vocab=vocabulary, save_path=save_vocab_file,
+    #                              file_type=file_type[0])
     parse_json_list_2_pyg_object(jsonl_file=benign_json_path, label=0, vocab=vocabulary, save_path=save_vocab_file,
                                  file_type=file_type[1])
diff --git a/src/DistTrainModel.py b/src/DistTrainModel.py
index e9ce468..1b2959e 100644
--- a/src/DistTrainModel.py
+++ b/src/DistTrainModel.py
@@ -16,7 +16,7 @@ from omegaconf import DictConfig
 from prefetch_generator import BackgroundGenerator
 from sklearn.metrics import roc_auc_score, roc_curve
 from torch import nn
-from torch_geometric.loader import DataLoader
+from torch_geometric.data import DataLoader
 from tqdm import tqdm
 
 from models.HierarchicalGraphModel import HierarchicalGraphNeuralNetwork
@@ -122,7 +122,7 @@ def train_one_epoch(local_rank, train_loader, valid_loader, model, criterion, op
 def validate(local_rank, valid_loader, model, criterion, evaluate_flag, distributed, nprocs, original_valid_length, result_file, details):
     model.eval()
     if distributed:
-        local_device = torch.device("cuda", local_rank)
+        local_device = torch.device("cpu", local_rank)
     else:
         local_device = torch.device("cuda")
 
@@ -252,9 +252,18 @@ def main_train_worker(local_rank: int, nprocs: int, train_params: TrainParams, m
         if local_rank == 0:
             write_into(log_result_file, "\n{} start of {}-epoch, init best_auc={}, start time={} {}".format("-" * 50, epoch, best_auc,
                                                                                                             time_start.strftime("%Y-%m-%d@%H:%M:%S"), "-" * 50))
-        smooth_avg_reduced_loss_list, best_auc = train_one_epoch(local_rank=local_rank, train_loader=train_loader, valid_loader=valid_loader, model=model, criterion=criterion,
-                                                                 optimizer=optimizer, nprocs=nprocs, idx_epoch=epoch, best_auc=best_auc, best_model_file=best_model_path,
-                                                                 original_valid_length=ori_valid_length, result_file=log_result_file)
+        smooth_avg_reduced_loss_list, best_auc = train_one_epoch(local_rank=local_rank,
+                                                                 train_loader=train_loader,
+                                                                 valid_loader=valid_loader,
+                                                                 model=model,
+                                                                 criterion=criterion,
+                                                                 optimizer=optimizer,
+                                                                 nprocs=nprocs,
+                                                                 idx_epoch=epoch,
+                                                                 best_auc=best_auc,
+                                                                 best_model_file=best_model_path,
+                                                                 original_valid_length=ori_valid_length,
+                                                                 result_file=log_result_file)
         all_batch_avg_smooth_loss_list.extend(smooth_avg_reduced_loss_list)
 
         # adjust learning rate
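
Note on the two single-line DistTrainModel.py hunks above: torch_geometric.data.DataLoader is the pre-2.0 import path (PyG 2.0 moved DataLoader to torch_geometric.loader), and a CPU device for distributed validation sits uneasily next to the "nccl" backend configured in configs/default.yaml, since NCCL only exchanges CUDA tensors. Below is a minimal sketch of a version-tolerant import plus a backend-aware device choice; pick_device is a hypothetical helper written for illustration, not a function from this repository.

import torch

# DataLoader moved from torch_geometric.data to torch_geometric.loader in
# PyG 2.0; trying the new path first keeps the script working on both,
# instead of hard-coding either import as the patch does.
try:
    from torch_geometric.loader import DataLoader  # PyG >= 2.0
except ImportError:
    from torch_geometric.data import DataLoader    # PyG < 2.0


def pick_device(distributed: bool, local_rank: int, backend: str = "nccl") -> torch.device:
    # Hypothetical helper (assumption, not in the repo): NCCL only
    # communicates CUDA tensors, so under the "nccl" backend each rank
    # must use its own CUDA device; CPU validation would require a
    # CPU-capable backend such as "gloo", and CPU devices carry no
    # meaningful per-rank index anyway.
    if distributed and backend == "nccl":
        return torch.device("cuda", local_rank)
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")

Under this sketch, validate() would call pick_device(distributed, local_rank) rather than hard-coding the device string, so the dist_backend value in configs/default.yaml stays the single source of truth.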